Skip to content

Commit 8352b14

Browse files
committed
fix: aplica transformação de URL no campo 'url' principal (#90)
O campo 'url' não estava sendo processado pela função _build_file_url(), resultando em URLs completas mesmo com a feature flag REPLACE_FILE_URL_BASE ativada. Apenas o campo 'txt_url' estava sendo transformado corretamente.

Alterações:
- gazettes/gazette_access.py: aplica _build_file_url() ao campo 'url'
- themed_excerpts/themed_excerpt_access.py: aplica _build_file_url() ao campo 'source_url'

Testes adicionados:
- tests/test_gazette_file_url_builder.py: 13 testes para validar transformação de URLs em gazettes
- tests/test_themed_excerpt_file_url_builder.py: 8 testes para validar transformação de URLs em themed excerpts

Os testes cobrem todos os cenários:
1. Caminhos relativos (dados novos)
2. URLs completas com substituição de base (migração)
3. URLs completas sem alteração (modo legado)
4. Edge cases (trailing slashes, protocolos diferentes, etc.)

Closes #XX
1 parent 948343e commit 8352b14

7 files changed

Lines changed: 958 additions & 134 deletions

File tree

gazettes/gazette_access.py

Lines changed: 6 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
PaginationMixin,
1616
HighlightMixin,
1717
)
18+
from utils import build_file_url
1819

1920

2021
class GazetteRequest:
@@ -335,71 +336,6 @@ def get_gazettes(
335336
def get_total_number_items(self, search_response_json: Dict):
336337
return search_response_json["hits"]["total"]["value"]
337338

338-
def _build_file_url(self, path_or_url: str) -> str:
339-
"""
340-
Builds the complete file URL from a relative path or processes legacy URLs.
341-
342-
This method supports three scenarios:
343-
1. New data: relative paths (e.g., "3304557/2019/file.txt")
344-
2. Old data: full URLs with automatic base URL replacement
345-
3. Legacy mode: full URLs returned as-is (backward compatibility)
346-
347-
Environment variables:
348-
- QUERIDO_DIARIO_FILES_ENDPOINT: New base URL for files
349-
- REPLACE_FILE_URL_BASE: Boolean flag to enable base URL replacement (true/false)
350-
351-
Examples:
352-
- If REPLACE_FILE_URL_BASE=true and
353-
QUERIDO_DIARIO_FILES_ENDPOINT="https://cdn.queridodiario.ok.org.br"
354-
Then "https://queridodiario.nyc3.digitaloceanspaces.com/3304557/2019/file.txt"
355-
becomes "https://cdn.queridodiario.ok.org.br/3304557/2019/file.txt"
356-
357-
Args:
358-
path_or_url: Either a relative path or a full URL
359-
360-
Returns:
361-
Complete URL to access the file
362-
"""
363-
import os
364-
import re
365-
366-
endpoint = os.environ.get("QUERIDO_DIARIO_FILES_ENDPOINT", "")
367-
replace_base_enabled = (
368-
os.environ.get("REPLACE_FILE_URL_BASE", "false").lower() == "true"
369-
)
370-
371-
# Check if it's a URL (supports http://, https://, s3://)
372-
is_url = (
373-
path_or_url.startswith("http://")
374-
or path_or_url.startswith("https://")
375-
or path_or_url.startswith("s3://")
376-
)
377-
378-
# Scenario 1: Relative path (new data)
379-
if not is_url:
380-
if not endpoint:
381-
return path_or_url # No endpoint configured
382-
383-
endpoint = endpoint.rstrip("/")
384-
path = path_or_url.lstrip("/")
385-
return f"{endpoint}/{path}"
386-
387-
# Scenario 2: Full URL with base replacement enabled
388-
if replace_base_enabled and endpoint:
389-
# Extract path from URL using regex
390-
# Pattern: <protocol>://<domain>/<path>
391-
# Supports: http://, https://, s3://
392-
pattern = r"^(https?://|s3://)[^/]+/(.+)$"
393-
match = re.match(pattern, path_or_url)
394-
395-
if match:
396-
relative_path = match.group(2)
397-
endpoint_clean = endpoint.rstrip("/")
398-
return f"{endpoint_clean}/{relative_path}"
399-
400-
# Scenario 3: Legacy mode - return URL as-is
401-
return path_or_url
402-
403339
def create_list_with_gazette_objects(self, gazette_hits: List[Dict]):
404340
return [self._assemble_gazette_object(gazette) for gazette in gazette_hits]
405341

@@ -411,14 +347,17 @@ def _assemble_gazette_object(self, gazette):
411347
)
412348

413349
# Build file URL from relative path or process legacy URL
350+
file_url = gazette["_source"]["url"]
351+
url = build_file_url(file_url)
352+
414353
file_raw_txt = gazette["_source"].get("file_raw_txt", None)
415-
txt_url = self._build_file_url(file_raw_txt) if file_raw_txt else None
354+
txt_url = build_file_url(file_raw_txt) if file_raw_txt else None
416355

417356
return GazetteSearchResult(
418357
gazette["_source"]["territory_id"],
419358
datetime.strptime(gazette["_source"]["date"], "%Y-%m-%d").date(),
420359
datetime.fromisoformat(gazette["_source"]["scraped_at"]),
421-
gazette["_source"]["url"],
360+
url,
422361
gazette["_source"]["file_checksum"],
423362
gazette["_source"]["territory_name"],
424363
gazette["_source"]["state_code"],

0 commit comments

Comments
 (0)