Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
"nozomi",
"nsfwalbum",
"nudostar",
"nudostarforum",
"okporn",
"paheal",
"patreon",
Expand Down
201 changes: 201 additions & 0 deletions gallery_dl/extractor/nudostarforum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://nudostar.com/forum/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache

BASE_PATTERN = r"(?:https?://)?(?:www\.)?nudostar\.com/forum"


class NudostarforumExtractor(Extractor):
"""Base class for nudostar forum extractors"""
category = "nudostarforum"
cookies_domain = "nudostar.com"
cookies_names = ("xf_user",)
root = "https://nudostar.com/forum"
directory_fmt = ("{category}", "{thread[title]} ({thread[id]})")
filename_fmt = "{post[id]}_{num:>02}_{filename}.{extension}"
archive_fmt = "{post[id]}/{filename}"

def items(self):
self.login()

for post in self.posts():
internal, external = self._extract_post_urls(post["content"])

data = {"post": post}
post["count"] = data["count"] = len(internal) + len(external)
yield Message.Directory, "", data

data["num"] = 0
for url in internal:
data["num"] += 1
text.nameext_from_url(url, data)
yield Message.Url, url, data

for url in external:
data["num"] += 1
yield Message.Queue, url, data

def _extract_post_urls(self, content):
"""Extract image and video URLs from post content"""
internal = []
external = []
seen = set()

# Extract URLs from both href= and src= attributes
for attr in ('href="', 'src="'):
for url in text.extract_iter(content, attr, '"'):
if url in seen:
continue

# Internal attachments
if "/forum/attachments/" in url:
# Skip numeric-only IDs and non-file links
path = url.rstrip("/")
if path.split(".")[-1].isdigit() and "-" not in path:
continue
if "upload?" in url:
continue
seen.add(url)
# Normalize to full URL
if url.startswith("/"):
url = "https://nudostar.com" + url
internal.append(url)

# External image hosts
elif url.startswith("http") and "nudostar.com" not in url:
seen.add(url)
external.append(url)

return internal, external

def request_page(self, url):
try:
return self.request(url)
except exception.HttpError as exc:
if exc.status == 403:
raise exception.AuthRequired(
("username & password", "authenticated cookies"), None,
"Login required to view this content")
raise

def login(self):
if self.cookies_check(self.cookies_names):
return

username, password = self._get_auth_info()
if username:
self.cookies_update(self._login_impl(username, password))

@cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)

url = f"{self.root}/login/"
page = self.request(url).text
token = text.extr(page, 'name="_xfToken" value="', '"')

url = f"{self.root}/login/login"
data = {
"_xfToken" : token,
"login" : username,
"password" : password,
"remember" : "1",
"_xfRedirect": self.root + "/",
}
response = self.request(url, method="POST", data=data)

if not response.history or "xf_user" not in response.cookies:
raise exception.AuthenticationError()

return {
cookie.name: cookie.value
for cookie in self.cookies
if cookie.domain.endswith(self.cookies_domain)
}

def _pagination(self, base, pnum=None):
if pnum is None:
url = f"{self.root}{base}/"
pnum = 1
else:
url = f"{self.root}{base}/page-{pnum}"
pnum = None

while True:
page = self.request_page(url).text
yield page

if pnum is None or "pageNav-jump--next" not in page:
return
pnum += 1
url = f"{self.root}{base}/page-{pnum}"

def _parse_thread(self, page):
extr = text.extract_from(page)

title = text.unescape(extr("<title>", "<"))
if " | " in title:
title = title.rpartition(" | ")[0]

thread_id = extr('data-content-key="thread-', '"')

return {
"id" : thread_id,
"title": title.strip(),
}

def _parse_post(self, html):
extr = text.extract_from(html)

return {
"author": extr('data-author="', '"'),
"id" : extr('data-content="post-', '"'),
"date" : extr('datetime="', '"'),
"content": html, # Pass full article HTML for URL extraction
}


class NudostarforumPostExtractor(NudostarforumExtractor):
"""Extractor for individual posts on nudostar forum"""
subcategory = "post"
pattern = (rf"{BASE_PATTERN}"
rf"/threads/[^/?#]+\.(\d+)/post-(\d+)")
example = "https://nudostar.com/forum/threads/NAME.12345/post-67890"

def posts(self):
thread_id, post_id = self.groups
url = f"{self.root}/posts/{post_id}/"
page = self.request_page(url).text

pos = page.find(f'data-content="post-{post_id}"')
if pos < 0:
raise exception.NotFoundError("post")
html = text.extract(page, "<article ", "</article>", pos-200)[0]

self.kwdict["thread"] = self._parse_thread(page)
return (self._parse_post(html),)


class NudostarforumThreadExtractor(NudostarforumExtractor):
"""Extractor for threads on nudostar forum"""
subcategory = "thread"
pattern = rf"{BASE_PATTERN}(/threads/[^/?#]+\.(\d+))(?:/page-(\d+))?"
example = "https://nudostar.com/forum/threads/NAME.12345/"

def posts(self):
path, thread_id, pnum = self.groups

for page in self._pagination(path, pnum):
if "thread" not in self.kwdict:
self.kwdict["thread"] = self._parse_thread(page)

for html in text.extract_iter(page, "<article ", "</article>"):
yield self._parse_post(html)
29 changes: 29 additions & 0 deletions test/results/nudostarforum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import nudostarforum


__tests__ = (
{
"#url" : "https://nudostar.com/forum/threads/aspen-rae.106714/",
"#category": ("", "nudostarforum", "thread"),
"#class" : nudostarforum.NudostarforumThreadExtractor,
},

{
"#url" : "https://nudostar.com/forum/threads/aspen-rae.106714/page-2",
"#category": ("", "nudostarforum", "thread"),
"#class" : nudostarforum.NudostarforumThreadExtractor,
},

{
"#url" : "https://nudostar.com/forum/threads/name.12345/post-67890",
"#category": ("", "nudostarforum", "post"),
"#class" : nudostarforum.NudostarforumPostExtractor,
},

)