-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPipeline.py
More file actions
97 lines (85 loc) · 2.81 KB
/
Pipeline.py
File metadata and controls
97 lines (85 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from core import Repository, Extractor, Processor, PySZZ, Splitter
from utils import clone_repo
class BasicPipeline:
    """Chain the Extractor, PySZZ, Processor and Splitter stages for one repo.

    Typical usage::

        pipeline = BasicPipeline(cfg)
        pipeline.set_repo(cfg)
        pipeline.run()

    The stage objects are built once in ``__init__`` from ``cfg``; the target
    repository is selected separately through ``set_repo`` so that it can be
    changed between calls to ``run``.
    """

    # Repository the pipeline operates on; stays None until set_repo() is
    # called. Defined at class level so run() can detect the "not set" state.
    repo = None

    # Values accepted for cfg.mode in set_repo().
    _VALID_MODES = ("local", "remote")

    def __init__(self, cfg):
        """Build the four stage objects from the configuration object ``cfg``.

        Args:
            cfg: Configuration object exposing the ``extractor_*``,
                ``pyszz_*``, ``processor_save``, ``dataset_save_path`` and
                ``repo_language`` attributes read below.
        """
        # init extractor
        self.extractor = Extractor(
            start=cfg.extractor_start,
            end=cfg.extractor_end,
            num_commits_per_file=cfg.extractor_num_commits_per_file,
            language=cfg.repo_language,
            save=cfg.extractor_save,
            force_reextract=cfg.extractor_force_reextract,
            check_uncommit=cfg.extractor_check_uncommit,
        )

        # init pyszz
        self.pyszz = PySZZ(
            pyszz_path=cfg.pyszz_path,
            log_path=cfg.pyszz_log_path,
            pyszz_conf=cfg.pyszz_conf,
            keep_output=cfg.pyszz_keep_output,
        )

        # init processor
        self.processor = Processor(
            save_path=cfg.dataset_save_path,
            save=cfg.processor_save,
        )

        # init splitter (shares the processor's save path)
        self.splitter = Splitter(save_path=cfg.dataset_save_path)

    def set_repo(self, cfg):
        """Select the repository to run on, according to ``cfg.mode``.

        Args:
            cfg: Configuration object; ``cfg.mode`` must be ``"local"`` or
                ``"remote"``.

        Raises:
            ValueError: If ``cfg.mode`` is neither ``"local"`` nor
                ``"remote"``. ``self.repo`` is left unchanged in that case.
        """
        # Explicit raise rather than assert: asserts are removed under
        # ``python -O``, which would let an invalid mode fall through to the
        # remote branch.
        if cfg.mode not in self._VALID_MODES:
            raise ValueError("Invalid mode: {}".format(cfg.mode))

        if cfg.mode == "local":
            self.repo = self.local_repo(cfg)
        else:
            self.repo = self.remote_repo(cfg)

    def _build_repo(self, cfg, repo_path):
        """Create a Repository for ``cfg`` whose working copy is ``repo_path``."""
        return Repository(
            cfg.repo_owner,
            cfg.repo_name,
            cfg.repo_save_path,
            repo_path,
            cfg.repo_language,
        )

    def local_repo(self, cfg):
        """Return a Repository built on the existing path ``cfg.repo_path``."""
        return self._build_repo(cfg, cfg.repo_path)

    def remote_repo(self, cfg):
        """Call ``clone_repo`` and return a Repository on ``cfg.repo_clone_path``."""
        clone_repo(
            cfg.repo_clone_path,
            cfg.repo_owner,
            cfg.repo_name,
            cfg.repo_clone_url,
        )
        return self._build_repo(cfg, cfg.repo_clone_path)

    def run(self):
        """Run extraction, PySZZ, processing and splitting on the current repo.

        Raises:
            RuntimeError: If no repository has been selected with
                ``set_repo`` yet.
        """
        repo = self.repo
        if repo is None:
            raise RuntimeError(
                "No repository set; call set_repo(cfg) before run()"
            )

        print("Running repository: {}/{}".format(repo.owner, repo.name))

        # extract repo
        self.extractor.set_repo(repo)
        self.extractor.run()

        # run pyszz
        print("Running PySZZ...")
        self.pyszz.run(
            repo.get_bug_fix_path(),
            repo.get_pyszz_conf_path(self.pyszz.conf),
            repo.get_repo_path(),
            repo.get_language(),
        )
        # NOTE: "lastest" (sic) is the method name exposed by PySZZ.
        szz_output = self.pyszz.get_lastest_output(
            repo.owner,
            repo.name,
        )
        print("PySZZ output: {}".format(len(szz_output)))

        # process data
        print("Processing information...")
        self.processor.set_repo(repo)
        self.processor.run(szz_output, self.extractor.end)

        # split data
        print("Splitting data...")
        self.splitter.set_processor(self.processor)
        self.splitter.run()

        print("Done")