11#!/usr/bin/env python3
22import base64
3- import os
43import re
54import tomllib
65from datetime import datetime # noqa
1312
1413class GitHubRepoExtractor :
1514 def __init__ (self , owner : str , repo : str , token : Optional [str ] = None ):
15+ """Constructor for the GitHubRepoExtractor.
16+
17+ Parameters
18+ ----------
19+ owner: str
20+ The name of the owner of the package.
21+ repo: str
22+ The name of the repository.
23+ token: str
24+ The private token for GitHub.
25+ """
1626 self .owner = owner
1727 self .repo = repo
1828 self .session = requests .Session ()
@@ -29,15 +39,45 @@ def _get(self, path: str) -> Any:
2939 return r .json ()
3040
def get_repo_metadata(self) -> Dict[str, Any]:
    """Get the metadata of the repository.

    Returns
    -------
    Dict[str, Any]
        The value returned by ``self._get`` for the
        ``/repos/{owner}/{repo}`` path.
    """
    # The path is built without any embedded whitespace: a space after the
    # owner or repo segment would end up in the requested path.
    return self._get(f"/repos/{self.owner}/{self.repo}")
3349
def get_contributors(self) -> List[Dict[str, Any]]:
    """Get the contributors of the repository.

    Returns
    -------
    List[Dict[str, Any]]
        The contributor records returned by ``self._get`` for the
        ``/repos/{owner}/{repo}/contributors`` path. These are full
        records, not bare names; ``extract_authors`` reads the ``"login"``
        key of each one.
    """
    # No whitespace inside the path segments, so the request path is exact.
    return self._get(f"/repos/{self.owner}/{self.repo}/contributors")
3658
def get_releases(self) -> List[Dict[str, Any]]:
    """Get the release records of the repository.

    Returns
    -------
    List[Dict[str, Any]]
        The release records returned by ``self._get`` for the
        ``/repos/{owner}/{repo}/releases`` path, one dictionary per
        release.
    """
    # No whitespace inside the path segments, so the request path is exact.
    return self._get(f"/repos/{self.owner}/{self.repo}/releases")
3968
4069 def get_file (self , path : str ) -> Optional [str ]:
70+ """Get the corresponding file based on path given.
71+
72+ Parameters
73+ ----------
74+ path: str
75+ The absolute/relative path of the file.
76+
77+ Returns
78+ -------
79+ The decoded file based on the path.
80+ """
4181 try :
4282 data = self ._get (f"/repos/{ self .owner } /{ self .repo } /contents/{ path } " )
4383 content = base64 .b64decode (data ["content" ])
@@ -50,6 +90,17 @@ def get_file(self, path: str) -> Optional[str]:
5090 VERSION_RE = re .compile (r"^(\d+)\.(\d+)\.(\d+)(?:-rc\.(\d+))?$" )
5191
5292 def parse_version (self , tag : str ) -> Optional [Dict [str , Any ]]:
93+ """Parse the version of the repository with a given tag.
94+
95+ Parameters
96+ ----------
97+ tag: str
98+ The tag/version of the package. The default format is <*.*.*>
99+
100+ Returns
101+ -------
102+ The parsed version of the tag
103+ """
53104 match = self .VERSION_RE .match (tag )
54105 if not match :
55106 return None
@@ -69,6 +120,16 @@ def parse_version(self, tag: str) -> Optional[Dict[str, Any]]:
69120 }
70121
71122 def parse_release (self , release : Dict [str , Any ]) -> Optional [Dict [str , Any ]]:
123+ """Parse the releases of the repository.
124+
125+ Parameters
126+ ----------
127+ release: The dictionary of all releases for the given repository.
128+
129+ Returns
130+ -------
131+ The parsed dictionary of each release for the given repository.
132+ """
72133 version = self .parse_version (release ["tag_name" ])
73134 if not version :
74135 return None
@@ -85,10 +146,22 @@ def parse_release(self, release: Dict[str, Any]) -> Optional[Dict[str, Any]]:
85146 }
86147
def extract_authors(self) -> List[str]:
    """Collect the login of every contributor of the repository.

    Returns
    -------
    List[str]
        The ``"login"`` value of each record returned by
        ``self.get_contributors()``, in the same order.
    """
    logins: List[str] = []
    for record in self.get_contributors():
        logins.append(record["login"])
    return logins
90157
91158 def extract_releases (self ) -> List [Dict [str , Any ]]:
159+ """Extract releases history of the repository.
160+
161+ Returns
162+ -------
163+ The parsed releases of the repository.
164+ """
92165 releases = self .get_releases ()
93166 parsed = []
94167 for release in releases :
@@ -98,6 +171,12 @@ def extract_releases(self) -> List[Dict[str, Any]]:
98171 return parsed
99172
100173 def extract (self ) -> Dict [str , Any ]:
174+ """Wrapper of extractor for all metadata of a given repository.
175+
176+ Returns
177+ -------
178+ The dictionary of metadata of the repository.
179+ """
101180 repo = self .get_repo_metadata ()
102181 pyproject = self .get_file ("pyproject.toml" )
103182
@@ -111,29 +190,105 @@ def extract(self) -> Dict[str, Any]:
111190 "program_description" : (
112191 tomllib .loads (pyproject )["project" ]["description" ] if pyproject else repo .get ("description" )
113192 ),
193+ "grants" : "all" ,
114194 "release" : self .extract_releases (),
115195 }
116196
117197 return data
118198
def get_owner_type(self) -> str:
    """Detect whether the owner is a user or an organization.

    Returns
    -------
    str
        ``"org"`` when the ``"type"`` field of the ``/users/{owner}``
        record equals ``"Organization"``; ``"user"`` otherwise (including
        when the field is missing).
    """
    # No trailing whitespace after the owner segment of the path.
    data = self._get(f"/users/{self.owner}")
    if data.get("type") == "Organization":
        return "org"
    return "user"
209+
def get_active_repositories_for_owner(self) -> List[Dict[str, Any]]:
    """Get all active repositories for the owner.

    Pages through the owner's repository listing, 100 entries per
    request, until a page comes back empty. A repository counts as
    active when neither its ``"archived"`` nor its ``"disabled"`` field
    is truthy.

    Returns
    -------
    List[Dict[str, Any]]
        The repository records that passed the filter, in listing order.
    """
    # The endpoint depends only on the owner type, so resolve it once
    # instead of on every page.
    scope = "orgs" if self.get_owner_type() == "org" else "users"
    base_path = f"/{scope}/{self.owner}/repos"

    repos: List[Dict[str, Any]] = []
    page = 1
    while True:
        # No whitespace may leak into the path or the query string.
        response = self._get(f"{base_path}?per_page=100&page={page}")
        if not response:
            # An empty page marks the end of the listing.
            break
        repos.extend(
            repo
            for repo in response
            if not repo.get("archived") and not repo.get("disabled")
        )
        page += 1
    return repos
235+
def extract_all_active_repositories(self) -> List[Dict[str, Any]]:
    """Extract metadata for all active repositories under the owner.

    Each repository returned by ``get_active_repositories_for_owner`` is
    processed by its own ``GitHubRepoExtractor``. A repository whose
    extraction raises is reported on stdout and skipped, so one failure
    does not abort the whole run.

    Returns
    -------
    List[Dict[str, Any]]
        The ``extract()`` result of every repository that succeeded.
    """
    results: List[Dict[str, Any]] = []

    for repo in self.get_active_repositories_for_owner():
        repo_name = repo["name"]
        # The per-repository extractor is built without a token and then
        # pointed at this instance's session, so all requests go through
        # the same session object.
        extractor = GitHubRepoExtractor(self.owner, repo_name)
        extractor.session = self.session
        try:
            results.append(extractor.extract())
        except Exception as exc:
            # Deliberate best-effort: report and move on to the next one.
            print(f"Skipping {self.owner}/{repo_name}: {exc}")
    return results
123255
124- parser = argparse .ArgumentParser (description = "Extract GitHub repository metadata" )
125- parser .add_argument ("repo" , help = "Repository in form owner/name" )
126- parser .add_argument ("--token" , help = "GitHub token (or set GITHUB_TOKEN)" )
127- args = parser .parse_args ()
128256
129- owner , name = args .repo .split ("/" , 1 )
130- token = args .token or os .getenv ("GITHUB_TOKEN" )
def extract_github(
    owner: str,
    repo: Optional[str] = None,
    *,
    all_repos: bool = False,
    token: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Programmatic entry point for Regolith.

    Parameters
    ----------
    owner: str
        The owner (user or organization) of the repository or repositories.
    repo: str, optional
        The repository name. Required unless ``all_repos`` is true.
    all_repos: bool
        When true, extract every active repository of ``owner`` and
        ignore ``repo``.
    token: str, optional
        Passed through to ``GitHubRepoExtractor``.

    Returns
    -------
    List[Dict[str, Any]]
        One extracted record per repository; a single-element list when
        only ``repo`` is extracted.

    Raises
    ------
    ValueError
        If ``all_repos`` is false and ``repo`` is empty or ``None``.
    """
    if not all_repos:
        if not repo:
            raise ValueError("repo must be provided unless --all is set")
        single = GitHubRepoExtractor(owner, repo, token)
        return [single.extract()]

    # Owner-wide extraction does not target one repository, so the
    # extractor is created with an empty repository name.
    owner_wide = GitHubRepoExtractor(owner, "", token)
    return owner_wide.extract_all_active_repositories()
131273
132- extractor = GitHubRepoExtractor (owner , name , token )
133- data = extractor .extract ()
134274
135- print (json .dumps (data , indent = 4 ))
def to_software_yaml(data: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """Convert a list of software records into a YAML-ready dictionary
    keyed by software ID.

    The software ID is the part of each record's ``"_id"`` after the
    first ``"."``. An ``"_id"`` without a dot is used unchanged as the
    key instead of raising ``IndexError``.

    Parameters
    ----------
    data: list of dict
        The software metadata records. Each one must have a string
        ``"_id"`` entry.

    Returns
    -------
    Dict[str, Dict[str, Any]]
        A mapping from software ID to the record's remaining fields
        (everything except ``"_id"``). If two records share a software
        ID, the later one replaces the earlier one.
    """
    yaml_data: Dict[str, Dict[str, Any]] = {}
    for entry in data:
        full_id = entry["_id"]
        # partition never raises: with no dot present, `dot` is empty and
        # the whole identifier is kept as the key.
        _, dot, suffix = full_id.partition(".")
        software_id = suffix if dot else full_id
        yaml_data[software_id] = {k: v for k, v in entry.items() if k != "_id"}
    return yaml_data
0 commit comments