Skip to content

Commit 9da0edb

Browse files
committed
feat: extract multiple repos for given owner/orgs by running api command
1 parent ffedb3e commit 9da0edb

3 files changed

Lines changed: 203 additions & 15 deletions

File tree

src/regolith/GHextractor.py

Lines changed: 170 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/usr/bin/env python3
22
import base64
3-
import os
43
import re
54
import tomllib
65
from datetime import datetime # noqa
@@ -13,6 +12,17 @@
1312

1413
class GitHubRepoExtractor:
1514
def __init__(self, owner: str, repo: str, token: Optional[str] = None):
15+
"""Constructor for the GitHubRepoExtractor.
16+
17+
Parameters
18+
----------
19+
owner: str
20+
The name of the owner of the package.
21+
repo: str
22+
The name of the repository.
23+
token: str, optional
24+
The private token for GitHub.
25+
"""
1626
self.owner = owner
1727
self.repo = repo
1828
self.session = requests.Session()
@@ -29,15 +39,45 @@ def _get(self, path: str) -> Any:
2939
return r.json()
3040

3141
def get_repo_metadata(self) -> Dict[str, Any]:
    """Fetch the metadata record of the repository.

    Returns
    -------
    dict
        Whatever ``self._get`` returns for ``/repos/<owner>/<repo>``.
    """
    endpoint = f"/repos/{self.owner}/{self.repo}"
    metadata = self._get(endpoint)
    return metadata
3349

3450
def get_contributors(self) -> List[Dict[str, Any]]:
    """Get the contributors of the repository.

    The contributors listing is paginated, so pages are requested until one
    comes back empty or shorter than the requested page size. A single
    un-paginated request would only yield the first page.

    Returns
    -------
    list of dict
        One record per contributor as returned by the API (callers read,
        for example, the ``"login"`` key), in the order the pages arrived.
    """
    # NOTE(review): 100 is assumed to be the largest page size the API allows.
    per_page = 100
    endpoint = f"/repos/{self.owner}/{self.repo}/contributors"
    contributors: List[Dict[str, Any]] = []
    page = 1
    while True:
        batch = self._get(f"{endpoint}?per_page={per_page}&page={page}")
        if not batch:
            break
        contributors.extend(batch)
        if len(batch) < per_page:
            # A short page is the last one; skip the extra empty request.
            break
        page += 1
    return contributors
3658

3759
def get_releases(self) -> List[Dict[str, Any]]:
    """Get the summaries of each release of the repository.

    The releases listing is paginated, so pages are requested until one
    comes back empty or shorter than the requested page size. A single
    un-paginated request would only yield the first page.

    Returns
    -------
    list of dict
        One record per release as returned by the API (callers read, for
        example, the ``"tag_name"`` key), in the order the pages arrived.
    """
    # NOTE(review): 100 is assumed to be the largest page size the API allows.
    per_page = 100
    endpoint = f"/repos/{self.owner}/{self.repo}/releases"
    releases: List[Dict[str, Any]] = []
    page = 1
    while True:
        batch = self._get(f"{endpoint}?per_page={per_page}&page={page}")
        if not batch:
            break
        releases.extend(batch)
        if len(batch) < per_page:
            # A short page is the last one; skip the extra empty request.
            break
        page += 1
    return releases
3968

4069
def get_file(self, path: str) -> Optional[str]:
70+
"""Get the corresponding file based on path given.
71+
72+
Parameters
73+
----------
74+
path: str
75+
The absolute/relative path of the file.
76+
77+
Returns
78+
-------
79+
The decoded file based on the path.
80+
"""
4181
try:
4282
data = self._get(f"/repos/{self.owner}/{self.repo}/contents/{path}")
4383
content = base64.b64decode(data["content"])
@@ -50,6 +90,17 @@ def get_file(self, path: str) -> Optional[str]:
5090
VERSION_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)(?:-rc\.(\d+))?$")
5191

5292
def parse_version(self, tag: str) -> Optional[Dict[str, Any]]:
93+
"""Parse the version of the repository with a given tag.
94+
95+
Parameters
96+
----------
97+
tag: str
98+
The tag/version of the package. The expected format is <major>.<minor>.<patch>, optionally followed by -rc.<n>
99+
100+
Returns
101+
-------
102+
The parsed version of the tag
103+
"""
53104
match = self.VERSION_RE.match(tag)
54105
if not match:
55106
return None
@@ -69,6 +120,16 @@ def parse_version(self, tag: str) -> Optional[Dict[str, Any]]:
69120
}
70121

71122
def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]:
123+
"""Parse the releases of the repository.
124+
125+
Parameters
126+
----------
127+
release: The dictionary describing a single release of the given repository.
128+
129+
Returns
130+
-------
131+
The parsed dictionary of each release for the given repository.
132+
"""
72133
version = self.parse_version(release["tag_name"])
73134
if not version:
74135
return None
@@ -85,10 +146,22 @@ def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]:
85146
}
86147

87148
def extract_authors(self) -> List[str]:
    """Collect the login names of the repository's contributors.

    Returns
    -------
    list of str
        The ``"login"`` value of every record from ``get_contributors``,
        in the same order.
    """
    logins = []
    for record in self.get_contributors():
        logins.append(record["login"])
    return logins
90157

91158
def extract_releases(self) -> List[Dict[str, Any]]:
159+
"""Extract releases history of the repository.
160+
161+
Returns
162+
-------
163+
The parsed releases of the repository.
164+
"""
92165
releases = self.get_releases()
93166
parsed = []
94167
for release in releases:
@@ -98,6 +171,12 @@ def extract_releases(self) -> List[Dict[str, Any]]:
98171
return parsed
99172

100173
def extract(self) -> Dict[str, Any]:
174+
"""Wrapper of extractor for all metadata of a given repository.
175+
176+
Returns
177+
-------
178+
The dictionary of metadata of the repository.
179+
"""
101180
repo = self.get_repo_metadata()
102181
pyproject = self.get_file("pyproject.toml")
103182

@@ -111,29 +190,105 @@ def extract(self) -> Dict[str, Any]:
111190
"program_description": (
112191
tomllib.loads(pyproject)["project"]["description"] if pyproject else repo.get("description")
113192
),
193+
"grants": "all",
114194
"release": self.extract_releases(),
115195
}
116196

117197
return data
118198

199+
def get_owner_type(self) -> str:
    """Tell whether the owner is an organization or a user.

    Returns
    -------
    str
        "org" when the ``/users/<owner>`` record has type "Organization",
        otherwise "user".
    """
    account = self._get(f"/users/{self.owner}")
    if account.get("type") == "Organization":
        return "org"
    return "user"
209+
210+
def get_active_repositories_for_owner(self) -> List[Dict[str, Any]]:
    """Get all active repositories for the owner.

    Repositories are listed page by page from the organization or user
    endpoint, depending on ``get_owner_type``. Entries flagged ``archived``
    or ``disabled`` are dropped.

    Returns
    -------
    list of dict
        The repository records that are neither archived nor disabled, in
        the order the pages arrived.
    """
    # NOTE(review): 100 is assumed to be the largest page size the API allows.
    per_page = 100
    # The endpoint depends only on the owner type, so resolve it once
    # instead of on every loop iteration.
    scope = "orgs" if self.get_owner_type() == "org" else "users"
    endpoint = f"/{scope}/{self.owner}/repos"

    repos: List[Dict[str, Any]] = []
    page = 1
    while True:
        batch = self._get(f"{endpoint}?per_page={per_page}&page={page}")
        if not batch:
            break
        repos.extend(
            repo for repo in batch if not repo.get("archived") and not repo.get("disabled")
        )
        if len(batch) < per_page:
            # A short page is the last one; skip the extra empty request.
            break
        page += 1
    return repos
235+
236+
def extract_all_active_repositories(self) -> List[Dict[str, Any]]:
    """Run the extractor on every active repository of the owner.

    A fresh ``GitHubRepoExtractor`` is created per repository and given this
    instance's session. A repository whose extraction raises is reported on
    stdout and left out of the result.

    Returns
    -------
    list of dict
        The extracted metadata of each repository that was processed
        without error.
    """
    collected = []
    for entry in self.get_active_repositories_for_owner():
        name = entry["name"]
        worker = GitHubRepoExtractor(self.owner, name)
        # Reuse this instance's session rather than the worker's own.
        worker.session = self.session
        try:
            record = worker.extract()
        except Exception as exc:
            print(f"Skipping {self.owner}/{name}: {exc}")
        else:
            collected.append(record)
    return collected
123255

124-
parser = argparse.ArgumentParser(description="Extract GitHub repository metadata")
125-
parser.add_argument("repo", help="Repository in form owner/name")
126-
parser.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)")
127-
args = parser.parse_args()
128256

129-
owner, name = args.repo.split("/", 1)
130-
token = args.token or os.getenv("GITHUB_TOKEN")
257+
def extract_github(
    owner: str,
    repo: Optional[str] = None,
    *,
    all_repos: bool = False,
    token: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Programmatic entry point for Regolith.

    Parameters
    ----------
    owner: str
        The GitHub owner or organization.
    repo: str, optional
        The repository name. Required unless ``all_repos`` is true.
    all_repos: bool
        When true, extract every active repository of ``owner`` and ignore
        ``repo``.
    token: str, optional
        The GitHub token handed to the extractor.

    Returns
    -------
    list of dict
        The extracted metadata; a one-element list in single-repo mode.

    Raises
    ------
    ValueError
        If ``all_repos`` is false and ``repo`` is empty or missing.
    """
    if all_repos:
        return GitHubRepoExtractor(owner, "", token).extract_all_active_repositories()
    if not repo:
        raise ValueError("repo must be provided unless --all is set")
    return [GitHubRepoExtractor(owner, repo, token).extract()]
131273

132-
extractor = GitHubRepoExtractor(owner, name, token)
133-
data = extractor.extract()
134274

135-
print(json.dumps(data, indent=4))
275+
def to_software_yaml(data):
    """Convert software records into a YAML-ready dictionary keyed by software ID.

    Parameters
    ----------
    data: list of dict
        The software metadata records. Each record must have an ``"_id"``
        key.

    Returns
    -------
    dict
        Maps each software ID to the record's remaining fields (everything
        except ``"_id"``). The software ID is the part of ``"_id"`` after
        the first ``"."``, or the whole ``"_id"`` when it has no ``"."``.
        Records that share a software ID overwrite earlier ones.
    """
    yaml_data = {}
    for entry in data:
        full_id = entry["_id"]
        # Drop the prefix up to the first ".". An id with no "." has no
        # prefix to drop, so it is used whole instead of raising IndexError.
        _, dot, remainder = full_id.partition(".")
        software_id = remainder if dot else full_id
        yaml_data[software_id] = {k: v for k, v in entry.items() if k != "_id"}
    return yaml_data

src/regolith/commands.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,13 @@
77
from copy import copy
88
from pprint import pprint
99

10+
import yaml
11+
1012
from regolith import storage
1113
from regolith.builder import BUILDERS, builder
1214
from regolith.deploy import deploy as dploy
1315
from regolith.emailer import emailer
16+
from regolith.GHextractor import extract_github, to_software_yaml
1417
from regolith.helper import FAST_UPDATER_WHITELIST, HELPERS, UPDATER_HELPERS, helpr
1518
from regolith.runcontrol import RunControl
1619
from regolith.tools import string_types
@@ -258,12 +261,32 @@ def validate(rc):
258261
# sys.exit(f"Validation failed on some records")
259262

260263

264+
def ghextractor(rc):
    """Extract GitHub repository metadata and write it out as software YAML.

    Reads ``owner`` from ``rc`` plus the optional ``repo``, ``all``,
    ``token`` and ``output`` attributes. The token falls back to the
    ``GITHUB_TOKEN`` environment variable and the output path to
    ``software.yml``.
    """
    owner_name = rc.owner
    options = {
        "repo": getattr(rc, "repo", None),
        "all_repos": getattr(rc, "all", False),
        "token": getattr(rc, "token", None) or os.getenv("GITHUB_TOKEN"),
    }
    software = to_software_yaml(extract_github(owner_name, **options))
    destination = getattr(rc, "output", "software.yml")
    with open(destination, "w", encoding="utf-8") as handle:
        yaml.safe_dump(software, handle, sort_keys=False)
    print(f"Wrote {destination}")
281+
282+
261283
# Maps a command name (e.g. the "gh-extractor" subparser name) to the callable
# that implements it; each callable receives the run-control object ``rc``.
# NOTE(review): presumably these are the commands that run without a database
# connection, in contrast to CONNECTED_COMMANDS -- confirm against the dispatcher.
DISCONNECTED_COMMANDS = {
262284
"rc": lambda rc: print(rc._pformat()),
263285
"deploy": deploy,
264286
"store": storage.main,
265287
"json-to-yaml": json_to_yaml,
266288
"yaml-to-json": yaml_to_json,
289+
"gh-extractor": ghextractor,
267290
}
268291

269292
CONNECTED_COMMANDS = {

src/regolith/main.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,16 @@ def create_parser():
254254
default=None,
255255
)
256256

257+
# GitHub extractor subparser
258+
ghe = subp.add_parser(
259+
"gh-extractor",
260+
help="Extract GitHub repository metadata and write software YAML",
261+
)
262+
ghe.add_argument("owner", help="GitHub owner or organization")
263+
ghe.add_argument("--repo", help="Single repository name")
264+
ghe.add_argument("--all", action="store_true", help="Extract all active repositories")
265+
ghe.add_argument("--token", help="GitHub token (or set GITHUB_TOKEN)")
266+
257267
# Validator
258268
val = subp.add_parser("validate", help="Validates db")
259269
val.add_argument(

0 commit comments

Comments
 (0)