-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRawLoadViaJSONFile.py
More file actions
103 lines (96 loc) · 6.83 KB
/
RawLoadViaJSONFile.py
File metadata and controls
103 lines (96 loc) · 6.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import re, pandas as pd, numpy as np, os, json
from io import StringIO
from IPython.core.display import display
import time # time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(1634309423832/1000))
# Input/output folder layout. NOTE: the final assignment deliberately
# overrides the relative EventData path with a machine-specific absolute one.
folderEventSpecs = os.path.realpath('../ESOUIDocumentation/')
folderRawLogs = os.path.realpath('../AnyLoggerRawLuaDumps/')
# folderEventData = 'C:/Beachhead/Misc/LogTinker/outdata/'
folderEventData = os.path.realpath('../Sandbox_Brian/EventData/')
# Raw string: the original quoted literal contained invalid escape sequences
# ('\F', '\E', '\S' — a SyntaxWarning on modern Python); the value is unchanged.
folderEventData = os.path.realpath(r'C:\FolderToPutLogFiles\EsoAnalytics\Sandbox_Brian\EventData')
def removeMatch(indict): return {key:value for (key,value) in indict.items() if key!=value}
def switchType(intype):
    """Map a pandastype name from the JSON spec to the dtype used for the column.

    'boolean' becomes 'category', 'string' becomes 'str'; every other
    type name passes through unchanged.
    """
    remap = {'boolean': 'category', 'string': 'str'}
    # elif intype == 'category': return 'str' # testing something
    return remap.get(intype, intype)
# Column layout of the pseudo-CSV built from the raw Lua dumps: four fixed
# header columns plus up to 24 positional event arguments, all read as str.
csvdtype = {'seq': int, 'timestamp': 'int64', 'player': 'category', 'event': 'category'}
csvNumArgs = ["arg"+str(i) for i in range(1, 25)]
csvdtype.update({arg: 'str' for arg in csvNumArgs})
csvcols = ";".join(csvdtype.keys())
# One Lua table entry: [<seq>] = "<payload>",
# Raw string fixes the invalid escape sequences ('\[', '\d', '\s') of the
# original non-raw literal; the compiled pattern is byte-identical.
rawFmtCSV = re.compile(r"\[(\d+)\]\s?=\s?\"(.*?)\",\s*")
def loadRaw(sourcefilename):
    """Parse one raw Lua dump into a DataFrame using the module-level CSV schema.

    Each dump entry has the shape [<seq>] = "<semicolon-separated fields>",
    Escaped quotes inside the payload are shielded with a :quot: placeholder
    while entries are extracted with rawFmtCSV, then restored afterwards.
    """
    with open(sourcefilename, "r", encoding='utf8') as fh:
        rawtext = fh.read()
    shielded = rawtext.replace("\\\"", ":quot:")
    rows = [seq + ";" + payload for seq, payload in rawFmtCSV.findall(shielded)]
    pseudocsv = "\n".join(rows).replace(":quot:", "\\\"")
    return pd.read_csv(StringIO(csvcols + "\n" + pseudocsv), dtype=csvdtype, sep=";", header=0)
def getAllEventArgs():
    """Read FinalEventArgsInfo.json and precompute, per event, everything the
    parsing loop needs.

    Returns a dict keyed by event name; each value holds:
      destEventPath     destination folder for that event's parquet files
      eventDfCols       fixed columns plus the spec'd argN columns
      eventArgRename    argN -> human-readable argument name
      eventArgConstmap  per-argument constant -> label maps (when present)
      eventArgTypes     argument name -> pandas dtype (via switchType)
    """
    retval = {}
    # 'with' fixes the original json.load(open(...)) pattern, which never
    # closed the file handle.
    with open(folderEventSpecs+'/'+'FinalEventArgsInfo.json') as specfile:
        allspecs = json.load(specfile)
    for eventName, eventArgDetails in allspecs.items():
        destEventPath = folderEventData+'/'+eventName
        eventDfCols = ['seq', 'timestamp', 'player']
        eventDfCols.extend(eventArgDetails.keys())
        eventArgRename = {argNum: argDetail['name'] for argNum, argDetail in eventArgDetails.items()}
        eventArgConstmap = {argDetail['name']: argDetail['constmap'] for argDetail in eventArgDetails.values() if 'constmap' in argDetail}
        eventArgTypes = {argDetail['name']: switchType(argDetail['pandastype']) for argDetail in eventArgDetails.values()}
        retval[eventName] = {'eventName': eventName, 'destEventPath': destEventPath, 'eventDfCols': eventDfCols, 'eventArgConstmap': eventArgConstmap, 'eventArgRename': eventArgRename, 'eventArgTypes': eventArgTypes}
    return retval
# Build the per-event parsing specs once, then make sure the ALL output
# folder exists before the load loops start writing into it.
allEventArgs = getAllEventArgs()
# exist_ok=True replaces the racy "if not exists: makedirs" check.
os.makedirs(folderEventData+'/'+'ALL', exist_ok=True)
# Shared per-output-file accumulators:
#   cntByEventFile: row count as loaded from disk (0 when the file is new)
#   datByEventFile: the in-memory DataFrame being accumulated for each file
cntByEventFile = {}
datByEventFile = {}
def mergeDataFrameIntoFilePrep(eventDataFrame, fileNameEventDate):
    """Accumulate *eventDataFrame* into the in-memory frame for *fileNameEventDate*.

    On first sight of a target file, load the existing parquet and remember
    its row count (so unchanged files can be skipped at write time); if no
    parquet exists yet, start from this frame with a count of 0, which
    guarantees the file gets written. Dedup and the actual write happen later.
    """
    if fileNameEventDate in datByEventFile:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        datByEventFile[fileNameEventDate] = pd.concat(
            [datByEventFile[fileNameEventDate], eventDataFrame])
    else:
        try:
            datByEventFile[fileNameEventDate] = pd.read_parquet(fileNameEventDate)
            cntByEventFile[fileNameEventDate] = len(datByEventFile[fileNameEventDate])
        except FileNotFoundError:
            datByEventFile[fileNameEventDate] = eventDataFrame
            cntByEventFile[fileNameEventDate] = 0  # set to 0 since nothing was in the file
# Event names found in the raw logs with no entry in FinalEventArgsInfo.json;
# collected in the parse loop below so they can be inspected afterwards.
lstNotInJSON = set()
# # Preload ALL files as new (only needed if reloading everything)
# for allFile in [f for f in os.listdir(folderEventData+'/'+'ALL') if f.endswith('.parquet')]:
# # if not allFile.startswith('2021-10-16') :
# begintime = int(round(time.time() * 1000))
# eventDataFrame = pd.read_parquet(folderEventData+'/'+'ALL'+'/'+allFile)
# fileprefix = time.strftime('%Y-%m-%d', time.localtime(eventDataFrame['timestamp'].values[-1]/1000))
# fileNameEventDate = folderEventData+'/'+'ALL'+'/'+fileprefix+'.parquet'
# mergeDataFrameIntoFilePrep(eventDataFrame, fileNameEventDate)
# cntByEventFile[fileNameEventDate] = 0
# print("PreLoaded:"+allFile, (int(round(time.time() * 1000))-begintime))
# Every raw AnyLogger dump (Lua table syntax) sitting in the logs folder.
rawfiles = [f for f in os.listdir(folderRawLogs) if f.endswith('.lua') or f.endswith('.lson')]
# Pass 1: load every raw dump and fold it into the in-memory frame for the
# daily ALL/<date>.parquet file. The date comes from the dump's LAST
# timestamp (epoch milliseconds, hence the /1000).
for rawfile in rawfiles:
    # if rawfile.startswith('2021-10-18_20-11-36_DOWELL-ORIGIN.lua') :
    # if rawfile.startswith('2021-10-16_20-19-06_SAMANTHA-GAMING.lua') :
    # if rawfile.startswith('NA') :
    begintime = int(round(time.time() * 1000))  # wall-clock ms for the timing print
    eventDataFrame = loadRaw(folderRawLogs+'/'+rawfile)
    # NOTE(review): assumes rows are in timestamp order so .values[-1] is the
    # latest event — confirm for dumps whose events span midnight.
    fileprefix = time.strftime('%Y-%m-%d', time.localtime(eventDataFrame['timestamp'].values[-1]/1000))
    fileNameEventDate = folderEventData+'/'+'ALL'+'/'+fileprefix+'.parquet'
    mergeDataFrameIntoFilePrep(eventDataFrame, fileNameEventDate)
    print("Loaded from raw:"+rawfile, (int(round(time.time() * 1000))-begintime))
# Pass 2: dedupe each daily ALL frame and, when it actually gained rows,
# split it by event name into per-event folders using the JSON specs.
# Snapshot the dict first: the inner merge call adds per-event entries to
# datByEventFile while we iterate.
lstALLFiles = dict(datByEventFile)
for (fileNameEventDate, eventDataFrame) in lstALLFiles.items():
    begintime = int(round(time.time() * 1000))
    fileprefix = time.strftime('%Y-%m-%d', time.localtime(eventDataFrame['timestamp'].values[-1]/1000))
    # In-place dedup mutates the same object eventDataFrame refers to.
    datByEventFile[fileNameEventDate].drop_duplicates(subset=["player", "timestamp", "seq"], inplace=True)
    # New data iff dedup left a different row count than was loaded from disk.
    indDataIsNew = (len(datByEventFile[fileNameEventDate]) != cntByEventFile[fileNameEventDate])
    print(fileNameEventDate+" is new "+str(indDataIsNew))
    if indDataIsNew : # You'll want an indicator for handling if reprocessing for a changed JSON
        for eventName, grpdf in eventDataFrame.groupby('event'):
            if eventName in allEventArgs:
                e = allEventArgs[eventName]
                # Project to this event's columns, rename argN -> real names,
                # map constants to labels, cast to the spec'd dtypes.
                # NOTE(review): this rebinds eventDataFrame, shadowing the outer
                # loop variable; the groupby iterator was created from the old
                # frame so iteration is unaffected, but it reads confusingly.
                eventDataFrame = pd.DataFrame(grpdf, columns=e['eventDfCols']).rename(columns=e['eventArgRename']).replace(to_replace=e['eventArgConstmap']).astype(dtype=e['eventArgTypes'])
                mergeDataFrameIntoFilePrep(eventDataFrame, e['destEventPath']+'/'+fileprefix+'.parquet')
            else: lstNotInJSON.add(eventName)
    print("Parsed to event folders:"+fileNameEventDate, (int(round(time.time() * 1000))-begintime))
begintime = int(round(time.time() * 1000))
# Create destination folders for brand-new output files (cnt == 0 marks
# "file did not exist on disk" in mergeDataFrameIntoFilePrep).
lstDirsToMake = set(os.path.dirname(key) for (key,value) in cntByEventFile.items() if value == 0 and not os.path.exists(os.path.dirname(key)))
for f in lstDirsToMake: os.makedirs(f)
# write files where data has changed
for (fileNameEventDate, eventDataFrame) in datByEventFile.items():
    eventDataFrame.drop_duplicates(subset=["player", "timestamp", "seq"], inplace=True)
    # Skip the write when the deduped row count matches what was loaded from
    # disk; new files have count 0, so they are always written.
    if len(eventDataFrame) != cntByEventFile[fileNameEventDate] :
        eventDataFrame.to_parquet(fileNameEventDate)
print("Done writing files. ", (int(round(time.time() * 1000))-begintime))