|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +# This program is free software; you can redistribute it and/or modify |
| 4 | +# it under the terms of the GNU General Public License version 2 as |
| 5 | +# published by the Free Software Foundation. |
| 6 | + |
| 7 | +"""Extractors for https://redbust.com/""" |
| 8 | + |
| 9 | +from .common import GalleryExtractor, Extractor, Message |
| 10 | +from .. import text |
| 11 | + |
# URL prefix shared by every extractor pattern in this module;
# the "http://" / "https://" scheme is optional
BASE_PATTERN = r"(?:https?://)?redbust\.com"
| 13 | + |
| 14 | + |
class RedbustExtractor(Extractor):
    """Base class for RedBust extractors"""
    category = "redbust"
    root = "https://redbust.com"
    filename_fmt = "{filename}.{extension}"

    def items(self):
        """Queue every URL from self.galleries() for the gallery extractor"""
        queue_data = {"_extractor": RedbustGalleryExtractor}
        for gallery_url in self.galleries():
            yield Message.Queue, gallery_url, queue_data

    def _pagination(self, path, page=None):
        """Yield the post URLs of a paginated listing

        'path' is the listing path relative to the site root.
        'page' is the HTML of the first listing page if the caller has
        already fetched it; when it is None, it gets requested here.
        """
        listing_url = f"{self.root}{path}/"
        base = f"{listing_url}page/"
        if page is None:
            page = self.request(listing_url).text

        next_num = 2
        while True:
            posts = text.extract_iter(
                page, '<h2 class="post-title">', "rel=")
            for post in posts:
                yield text.extr(post, 'href="', '"')

            # continue only while the current page contains
            # the URL of the following page
            next_url = f"{base}{next_num}/"
            if next_url not in page:
                return
            page = self.request(next_url).text
            next_num += 1
| 45 | + |
| 46 | + |
class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor):
    """Extractor for RedBust galleries"""
    pattern = BASE_PATTERN + r"/([\w-]+)/?$"
    example = "https://redbust.com/TITLE/"

    def items(self):
        """Dispatch to gallery or category handling based on page content

        The page is treated as a gallery when it contains a '?p=<id>'
        shortlink; otherwise it is handled as a category listing.
        """
        url = f"{self.root}/{self.groups[0]}/"
        self.page = page = self.request(url).text

        self.gallery_id = gid = text.extr(
            page, "<link rel='shortlink' href='https://redbust.com/?p=", "'")

        if gid:
            # NOTE(review): presumably stops GalleryExtractor from
            # requesting the page a second time - confirm
            self.gallery_url = False
            return GalleryExtractor.items(self)

        self.subcategory = "category"
        return self._items_category(page)

    def _items_category(self, _):
        """Queue all posts of a category listing as galleries

        The argument is ignored; the already fetched first page is taken
        from self.page and handed to the shared _pagination() helper
        instead of duplicating its paging loop here.
        """
        data = {"_extractor": RedbustGalleryExtractor}
        for url in self._pagination(f"/{self.groups[0]}", self.page):
            yield Message.Queue, url, data

    def metadata(self, _):
        """Return gallery metadata extracted from self.page"""
        extr = text.extract_from(self.page)

        return {
            "gallery_id"  : self.gallery_id,
            "gallery_slug": self.groups[0],
            # keeps every other element;
            # presumably skips separators between category names
            "categories"  : text.split_html(extr(
                '<li class="category">', "</li>"))[::2],
            "title"       : text.unescape(extr('class="post-title">', "<")),
            "date"        : text.parse_datetime(
                extr('class="post-byline">', "<").strip(), "%B %d, %Y"),
            "views"       : text.parse_int(extr("</b>", "v").replace(",", "")),
            # drops the first element; presumably a leading label
            "tags"        : text.split_html(extr(
                'class="post-tags">', "</p"))[1:],
        }

    def images(self, _):
        """Return a list of (url, data) tuples for all gallery images

        URLs carrying a '-<width>x<height>' size suffix are rewritten to
        the suffix-less file, with the original URL kept as '_fallback'.
        """
        results = []

        for img in text.extract_iter(self.page, "'><img ", ">"):
            src = text.extr(img, 'src="', '"')
            if not src:
                continue
            url = self._strip_size_suffix(src)
            data = None if url == src else {"_fallback": (src,)}
            results.append((url, data))

        if not results:
            # fallback for older galleries
            for path in text.extract_iter(
                    self.page, '<img src="/wp-content/uploads/', '"'):
                results.append(
                    (f"{self.root}/wp-content/uploads/{path}", None))

        return results

    @staticmethod
    def _strip_size_suffix(src):
        """Return 'src' without a trailing '-<width>x<height>' suffix

        Only a suffix made of two decimal numbers joined by 'x' directly
        before the file extension is removed. Checking merely for an 'x'
        after the last '-' would also mangle URLs such as
        '.../wp-content/uploads/example.jpg' into '.../wp.jpg'.
        """
        path, dash, end = src.rpartition("-")
        dims, dot, ext = end.rpartition(".")
        width, x, height = dims.partition("x")

        if dash and dot and x and width.isdecimal() and height.isdecimal():
            return f"{path}.{ext}"
        return src
| 122 | + |
| 123 | + |
class RedbustTagExtractor(RedbustExtractor):
    """Extractor for RedBust tag searches"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/tag/([\w-]+)"
    example = "https://redbust.com/tag/TAG/"

    def galleries(self):
        """Return an iterator over the post URLs listed for this tag"""
        tag = self.groups[0]
        return self._pagination(f"/tag/{tag}")
| 132 | + |
| 133 | + |
class RedbustArchiveExtractor(RedbustExtractor):
    """Extractor for RedBust monthly archive collections"""
    subcategory = "archive"
    pattern = BASE_PATTERN + r"(/\d{4}/\d{2})"
    example = "https://redbust.com/2010/01/"

    def galleries(self):
        """Return an iterator over the post URLs of this monthly archive"""
        # the matched group already is the '/YYYY/MM' path
        archive_path = self.groups[0]
        return self._pagination(archive_path)
| 142 | + |
| 143 | + |
class RedbustImageExtractor(RedbustExtractor):
    """Extractor for RedBust images"""
    subcategory = "image"
    directory_fmt = ("{category}", "{title}")
    pattern = BASE_PATTERN + r"/(?!tag/|\d{4}/)([\w-]+)/([\w-]+)/?$"
    example = "https://redbust.com/TITLE/SLUG/"

    def items(self):
        """Yield the directory and URL messages for a single image page

        Nothing is yielded when no image URL can be found on the page.
        """
        gallery_slug, image_slug = self.groups
        url = f"{self.root}/{gallery_slug}/{image_slug}/"
        page = self.request(url).text

        img_url = None

        # Prefer the largest candidate listed in the first 'srcset'
        if srcset := text.extr(page, 'srcset="', '"'):
            img_url = self._largest_from_srcset(srcset)

        # Fallback: take the 'src' of the <img> inside 'entry-inner'
        if not img_url:
            if entry := text.extr(page, "entry-inner ", "alt="):
                img_url = text.extr(entry, "img src=", " ").strip("\"'")

        if not img_url:
            return

        # 'num' is parsed from the text between the last '-'
        # of the image URL and the following '.'
        end = img_url.rpartition("-")[2]
        data = text.nameext_from_url(img_url, {
            "title"       : text.unescape(text.extr(
                page, 'title="Return to ', '"')),
            "image_id"    : text.extr(
                page, "rel='shortlink' href='https://redbust.com/?p=", "'"),
            "gallery_slug": gallery_slug,
            "image_slug"  : image_slug,
            "num"         : text.parse_int(end.partition(".")[0]),
            "count"       : 1,
            "url"         : img_url,
        })

        yield Message.Directory, data
        yield Message.Url, img_url, data

    @staticmethod
    def _largest_from_srcset(srcset):
        """Return the URL of the largest candidate in a 'srcset' value

        Candidates are compared by the number in their width ('640w') or
        pixel density ('2x') descriptor. A candidate without a usable
        descriptor counts as 0, and on ties the later candidate wins, so
        a 'srcset' without descriptors yields its last URL. Returns an
        empty string when no candidate URL is found.
        """
        best_url = ""
        best_size = -1.0

        for candidate in srcset.split(", "):
            url, _, descriptor = candidate.strip().partition(" ")
            if not url:
                continue
            try:
                # drop the trailing unit character ('w' or 'x')
                size = float(descriptor.strip()[:-1])
            except ValueError:
                size = 0.0
            if size >= best_size:
                best_url = url
                best_size = size

        return best_url
0 commit comments