Skip to content

Commit 8fb3b2d

Browse files
committed
break up plugins
1 parent ef76429 commit 8fb3b2d

99 files changed

Lines changed: 9068 additions & 8559 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

pyglossary/plugins/aard2_slob/__init__.py

Lines changed: 3 additions & 390 deletions
Large diffs are not rendered by default.
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import annotations
3+
4+
import re
5+
from typing import TYPE_CHECKING
6+
7+
if TYPE_CHECKING:
8+
from collections.abc import Iterator
9+
10+
from pyglossary import slob
11+
from pyglossary.glossary_types import EntryType, GlossaryType
12+
13+
from pyglossary.core import exc_note, log, pip
14+
from pyglossary.plugins.aard2_slob.tags import (
15+
supported_tags,
16+
t_copyright,
17+
t_created_at,
18+
t_created_by,
19+
t_edition,
20+
t_label,
21+
t_license_name,
22+
t_license_url,
23+
t_uri,
24+
)
25+
26+
27+
class Reader:
    """Read entries and metadata from a slob file into a glossary.

    ``open`` copies the slob tags into the glossary info, and iterating
    over the reader yields one entry per unique blob.
    """

    # Importable module name -> pip package name that provides it.
    depends = {
        "icu": "PyICU",  # >=1.5
    }

    def __init__(self, glos: GlossaryType) -> None:
        """Bind the reader to *glos* and start in the "not open" state."""
        self._glos = glos
        self._clear()
        # Matches a complete opening `<a href=...>` tag; `_href_sub` is
        # applied to every match to rewrite scheme-less links.
        self._re_bword = re.compile(
            "(<a href=[^<>]+?>)",
            re.IGNORECASE,
        )

    def close(self) -> None:
        """Close the slob object (if one is open) and reset reader state.

        The state is reset even when closing the slob object raises, so
        the reader never keeps a reference to a half-closed object.
        """
        try:
            if self._slobObj is not None:
                self._slobObj.close()
        finally:
            self._clear()

    def _clear(self) -> None:
        """Reset the reader to the "not open" state."""
        self._filename = ""
        self._slobObj: slob.Slob | None = None

    def open(self, filename: str) -> None:
        """Open *filename* and copy its tags into the glossary info.

        Raises:
            ModuleNotFoundError: if ``icu`` cannot be imported; an install
                hint is attached to the exception before it is re-raised.
        """
        try:
            import icu  # type: ignore # noqa: F401
        except ModuleNotFoundError as e:
            exc_note(e, f"Run `{pip} install PyICU` to install")
            raise
        from pyglossary import slob

        self._filename = filename
        self._slobObj = slob.open(filename)
        tags = dict(self._slobObj.tags.items())
        setInfo = self._glos.setInfo

        for tag, infoKey in (
            (t_label, "name"),
            (t_created_at, "creationTime"),
            (t_created_by, "author"),
        ):
            if tag in tags:
                setInfo(infoKey, tags[tag])

        # The copyright-related tags are merged into a single multi-line
        # "copyright" info value. They are popped so that the final loop
        # below does not see them again.
        copyrightLines: list[str] = []
        for key in (t_copyright, t_license_name, t_license_url):
            try:
                value = tags.pop(key)
            except KeyError:
                continue
            copyrightLines.append(value)
        if copyrightLines:
            setInfo("copyright", "\n".join(copyrightLines))

        for tag, infoKey in (
            (t_uri, "website"),
            (t_edition, "edition"),
        ):
            if tag in tags:
                setInfo(infoKey, tags[tag])

        # Every remaining tag that has no dedicated info key is kept
        # under a "slob."-prefixed key.
        for key, value in tags.items():
            if key in supported_tags:
                continue
            setInfo(f"slob.{key}", value)

    def __len__(self) -> int:
        """Return ``len()`` of the open slob object, or 0 if not open."""
        if self._slobObj is None:
            log.error("called len() on a reader which is not open")
            return 0
        return len(self._slobObj)

    @staticmethod
    def _href_sub(m: re.Match) -> str:
        """Return the matched ``<a href=...>`` tag with ``bword://`` added.

        Tags that already contain ``//`` are returned unchanged. Only the
        lowercase, quoted forms ``href="`` and ``href='`` are rewritten.
        """
        st = m.group(0)
        if "//" in st:
            return st
        return st.replace('href="', 'href="bword://').replace(
            "href='",
            "href='bword://",
        )

    def __iter__(self) -> Iterator[EntryType | None]:
        """Yield one glossary entry per unique blob of the open slob.

        ``None`` is yielded for each duplicate blob (so the caller can
        still update its progress bar). Blobs whose content type is
        neither HTML nor plain text are yielded as data entries.

        Raises:
            RuntimeError: if the reader is not open.
        """
        from pyglossary.slob import MIME_HTML, MIME_TEXT

        if self._slobObj is None:
            raise RuntimeError("iterating over a reader while it's not open")

        slobObj = self._slobObj
        blobSet = set()

        # slob library gives duplicate blobs when iterating over slobObj
        # even keeping the last id is not enough, since duplicate blobs
        # are not all consecutive. so we have to keep a set of blob IDs

        for blob in slobObj:
            id_ = blob.identity
            if id_ in blobSet:
                yield None  # update progressbar
                continue
            blobSet.add(id_)

            # blob.key is str, blob.content is bytes
            word = blob.key

            # Drop any parameters after ";" in the content type.
            ctype = blob.content_type.split(";")[0]
            if ctype not in {MIME_HTML, MIME_TEXT}:
                log.debug(f"unknown {blob.content_type=} in {word=}")
                word = word.removeprefix("~/")
                yield self._glos.newDataEntry(word, blob.content)
                continue

            # ctype is one of the two text types at this point.
            defiFormat = "h" if ctype == MIME_HTML else "m"

            defi = blob.content.decode("utf-8")
            defi = self._re_bword.sub(self._href_sub, defi)
            yield self._glos.newEntry(word, defi, defiFormat=defiFormat)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Names of the slob tags used by this plugin.
t_created_at = "created.at"
t_label = "label"
t_created_by = "created.by"
t_copyright = "copyright"
t_license_name = "license.name"
t_license_url = "license.url"
t_uri = "uri"
t_edition = "edition"

# Tags in this set are skipped when the reader copies the remaining tags
# into "slob."-prefixed info keys.
supported_tags = {
    t_label,
    t_created_at,
    t_created_by,
    t_copyright,
    t_uri,
    t_edition,
}

__all__ = [
    "supported_tags",
    "t_copyright",
    "t_created_at",
    "t_created_by",
    "t_edition",
    "t_label",
    "t_license_name",
    "t_license_url",
    "t_uri",
]

0 commit comments

Comments
 (0)