# LoadData.py: readers for the clusterdata-2011-2 trace tables (machine, job and task CSV files).
import numpy as np
import Agent
"""
Machine event:
1. timestamp
2. machine ID
3. event type
ADD (0): A machine became available to the cluster - all machines in the trace will have an ADD event.
REMOVE (1): A machine was removed from the cluster. Removals can occur due to failures or maintenance.
UPDATE (2): A machine available to the cluster had its available resources changed.
4. platform ID
Two machines with the same platform ID may have substantially different clock rates, memory speeds, core counts, etc.
5. capacity: CPU
6. capacity: memory
"""
"""
Machine attributes table:
1. timestamp
2. machine ID
3. attribute name: an opaque string
4. attribute value: either an opaque string or an integer
5. attribute deleted: a boolean indicating whether the attribute was deleted
"""
def Load_Machine(
        path: str = 'clusterdata-2011-2/machine_attributes/part-00000-of-00001.csv',
        limit: int = 10) -> int:
    """Scan the machine attributes table and print the first timestamps.

    Each row is expected to hold five comma-separated columns: timestamp,
    machine ID, attribute name, attribute value, attribute deleted.

    Args:
        path: CSV file to read; defaults to the single machine attributes
            shard used by the original code.
        limit: How many leading rows have their timestamp printed.

    Returns:
        The number of well-formed rows read.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rows_read = 0
    with open(path) as f:
        # Iterating the file ends cleanly at EOF, so no exception is needed
        # to terminate the loop.
        for raw_line in f:
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            fields = raw_line.rstrip('\n').split(',')
            if len(fields) != 5:
                # The original stopped reading at the first row it could not
                # unpack into five columns; keep that behaviour explicitly.
                break
            timestamp = fields[0]
            # 'if', not 'while': print one timestamp for each of the first
            # `limit` rows rather than the first row's timestamp repeatedly.
            if rows_read < limit:
                print(timestamp)
            rows_read += 1
    return rows_read
"""
Job events table
1. timestamp
2. missing info
3. job ID
4. event type
5. user name
6. scheduling class
7. job name: base64-encoded strings
8. logical job name
"""
def Load_Job(
        path: str = 'clusterdata-2011-2/job_events/part-00000-of-00500.csv') -> int:
    """Scan one shard of the job events table.

    Each row is expected to hold eight comma-separated columns: timestamp,
    missing info, job ID, event type, user name, scheduling class, job name,
    logical job name.

    The original body was a copy of the machine attributes reader (same file,
    same five-column unpack); it now reads the job events table instead.
    NOTE(review): the default shard name mirrors the 'part-00000-of-00500'
    naming used for the task tables in this file -- confirm it matches the
    local dataset layout, or pass ``path`` explicitly.

    Args:
        path: CSV file to read.

    Returns:
        The number of well-formed rows read.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rows_read = 0
    with open(path) as f:
        # Iterating the file ends cleanly at EOF, so no exception is needed
        # to terminate the loop.
        for raw_line in f:
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            fields = raw_line.rstrip('\n').split(',')
            if len(fields) != 8:
                # As before, stop at the first row that does not have the
                # expected number of columns.
                break
            (timestamp, missing_info, job_id, event_type,
             user_name, sched_class, job_name, logical_job_name) = fields
            rows_read += 1
    return rows_read
"""
Task events table
1. timestamp
2. missing info
3. job ID
4. task index - within the job
5. machine ID
6. event type
7. user name
8. scheduling class
9. priority
10. resource request for CPU cores
11. resource request for RAM
12. resource request for local disk space
13. different-machine constraint
"""
def Load_Task_Event(
        path: str = 'clusterdata-2011-2/task_events/part-00000-of-00500.csv') -> int:
    """Scan one shard of the task events table.

    Every line is split on commas; the resulting columns are not consumed
    yet, so the function currently only walks the file.

    The original loop called ``readline()`` forever: at end of file it
    returns an empty string, splitting that raises nothing, and the
    ``except``/``break`` exit was never reached. Iterating the file object
    terminates at EOF.

    Args:
        path: CSV file to read; defaults to the first task events shard.

    Returns:
        The number of lines read.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rows_read = 0
    with open(path) as f:
        for raw_line in f:
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            fields = raw_line.rstrip('\n').split(',')  # placeholder: columns are unused for now
            rows_read += 1
    return rows_read
"""
Task constraints table
1. timestamp
2. job ID
3. task index
4. attribute name -- corresponds to machine attribute table
5. attribute value -- either an opaque string or an integer or the empty string
6. comparison operator
"""
def Load_Task_Constraint(
        path: str = 'clusterdata-2011-2/task_constraints/part-00000-of-00500.csv') -> int:
    """Scan one shard of the task constraints table.

    Every line is split on commas; the resulting columns are not consumed
    yet, so the function currently only walks the file.

    The original loop never ended: ``readline()`` returns an empty string at
    end of file, splitting it raises nothing, and the ``except``/``break``
    exit was therefore unreachable. Iterating the file object stops at EOF.

    Args:
        path: CSV file to read; defaults to the first task constraints shard.

    Returns:
        The number of lines read.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rows_read = 0
    with open(path) as f:
        for raw_line in f:
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            fields = raw_line.rstrip('\n').split(',')  # placeholder: columns are unused for now
            rows_read += 1
    return rows_read
"""
Task resource usage table
1. start time of the measurement period
2. end time of the measurement period
3. job ID
4. task index
5. machine ID
6. mean CPU usage rate
7. canonical memory usage
8. assigned memory usage
9. unmapped page cache memory usage
10. total page cache memory usage
11. maximum memory usage
12. mean disk I/O time
13. mean local disk space used
14. maximum CPU usage
15. maximum disk IO time
16. cycles per instruction (CPI)
17. memory accesses per instruction (MAI)
18. sample portion
19. aggregation type (1 if maximums from subcontainers were summed)
20. sampled CPU usage: mean CPU usage during a random 1s sample in the
measurement period
"""
def Load_Task_Resource(
        path: str = 'clusterdata-2011-2/task_usage/part-00000-of-00500.csv') -> int:
    """Scan one shard of the task resource usage table.

    Every line is split on commas; the resulting columns are not consumed
    yet, so the function currently only walks the file.

    The original loop never ended: ``readline()`` returns an empty string at
    end of file, splitting it raises nothing, and the ``except``/``break``
    exit was therefore unreachable. Iterating the file object stops at EOF
    and reads the file lazily, one line at a time.

    Args:
        path: CSV file to read; defaults to the first task usage shard.

    Returns:
        The number of lines read.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rows_read = 0
    with open(path) as f:
        for raw_line in f:
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            fields = raw_line.rstrip('\n').split(',')  # placeholder: columns are unused for now
            rows_read += 1
    return rows_read
# Script entry point: when this file is executed directly, read the machine
# attributes table; the call is skipped when the file is imported.
if __name__ == "__main__":
    Load_Machine()