Commits
d322e06
Add window-fetching feature
Shohail-Ismail Aug 25, 2025
0da5299
Add 5 year backfill process using data retrieval windows
Shohail-Ismail Aug 25, 2025
c2a937c
Add main function
Shohail-Ismail Aug 25, 2025
14be2e0
Add tests
Shohail-Ismail Aug 25, 2025
0baf6a6
Minor cosmetic changes
Shohail-Ismail Aug 25, 2025
7940aad
Add missing import
Shohail-Ismail Aug 26, 2025
25a26db
Fix imports to be cleaner
Shohail-Ismail Aug 26, 2025
d828798
Fix CI-related error
Shohail-Ismail Aug 26, 2025
f00ee15
Fix CI error not showing up in pytest
Shohail-Ismail Aug 26, 2025
f5dc2bc
Fix PSR code check for solar rows
Shohail-Ismail Aug 26, 2025
ca02289
Move solar export code into separatede_export.py script
Shohail-Ismail Sep 15, 2025
fa02153
Move solar export code into separate 'de_export.py' script
Shohail-Ismail Sep 15, 2025
dc45a28
Merge branch 'german-solar-csv' of https://github.com/Shohail-Ismail/…
Shohail-Ismail Sep 15, 2025
e5e5e90
Make XML more accurate to new ENTSOE API docs
Shohail-Ismail Sep 15, 2025
adc564c
Correct tests to align with new XML, and fix HTTP error path in inval…
Shohail-Ismail Sep 15, 2025
43fd709
Remove 361K line backfilled-CSV
Shohail-Ismail Sep 15, 2025
6fdefd6
Correct Ruff and Black errors
Shohail-Ismail Sep 15, 2025
7d7a46f
Delete solar_consumer/exports/de_5_year_repopulate.csv
Shohail-Ismail Sep 15, 2025
6f02f2b
Moved de_export.py to \scripts
Shohail-Ismail Sep 15, 2025
5970e2c
Add ENTSOE_API_KEY to example.env + fix formatting
Shohail-Ismail Nov 3, 2025
86c9551
Add entsoe-py as dependency + migrate DE solar data fetching to use i…
Shohail-Ismail Jan 20, 2026
d923a9f
Refactor tests to mock EntsoePandasClient, and add 2 new tests accord…
Shohail-Ismail Jan 20, 2026
95c5b3b
Clean up code/comments
Shohail-Ismail Jan 20, 2026
c4abc62
Merge branch 'main' into german-solar-csv
Shohail-Ismail Jan 20, 2026
159 changes: 152 additions & 7 deletions solar_consumer/data/fetch_de_data.py
@@ -9,10 +9,135 @@
# Load environment variables
dotenv.load_dotenv()

# Prepare request
URL = "https://web-api.tp.entsoe.eu/api" # base URL for api
API_KEY = os.getenv("ENTSOE_API_KEY", "") # api key from env vars, empty string if missing
SOLAR_PSR_CODES = {"B16", "A-10Y1001A1001A83H"} # accept both real-world and test solar codes


def _fetch_de_window(start: datetime, end: datetime) -> pd.DataFrame:
"""
Fetch German solar gen data from ENTSOE for specific time window (UTC) (>24H FETCH)

It should be noted there are no resampling or filling gaps.
Returns DataFrame with columns:
- target_datetime_utc (UTC)
- solar_generation_kw (kW)
- tso_zone
"""

if not API_KEY:
        raise RuntimeError("ENTSOE_API_KEY is not set in the environment")

# Sanity check for date request
assert start < end, "Start date must be before end"
period_start = start.strftime("%Y%m%d%H%M")
period_end = end.strftime("%Y%m%d%H%M")

params = {
"documentType": "A75",
"processType": "A16",
"in_Domain": "10Y1001A1001A83F",
"psrType": "B16",
"periodStart": period_start,
"periodEnd": period_end,
"securityToken": API_KEY,
}

session = requests.Session()
    response = session.get(URL, params=params)
try:
response.raise_for_status()
except Exception as e:
logger.error("API request failed- {}: {}", response.status_code, e)
raise

# Parse XML response
root = ET.fromstring(response.content)
records = []

# For each TimeSeries (ts), extract the TSO and only solar PSR
for ts in root.findall(".//TimeSeries"):
zone = ts.findtext(".//inBiddingZone_Domain/Mrid")
psr = ts.findtext(".//MktPSRType/psrType")
if psr not in SOLAR_PSR_CODES:
continue

# Get each timestamped value in each ts
for pt in ts.findall(".//Period/Point"):
start_str = pt.findtext("timeInterval/start")
qty_str = pt.findtext("quantity")
try:
qty = float(qty_str)
except (TypeError, ValueError):
logger.error("Skipping malformed entry in response: ({}) in zone {}", qty_str, zone)
continue

dt = pd.to_datetime(start_str, utc=True)

records.append(
{
"target_datetime_utc": dt,
"solar_generation_kw": qty * 1000.0,
"tso_zone": zone,
}
)

# Create time-ordered dataframe and return completed window
# (multi-window calls will be concatenated by caller)
df = pd.DataFrame(records)
if not df.empty:
        df = df.sort_values("target_datetime_utc").reset_index(drop=True)
return df
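
# A minimal usage sketch (illustrative only, not part of this PR; assumes
# ENTSOE_API_KEY is set in the environment):
#   from datetime import datetime, timezone
#   df = _fetch_de_window(
#       datetime(2025, 7, 10, tzinfo=timezone.utc),
#       datetime(2025, 7, 11, tzinfo=timezone.utc),
#   )
#   # -> columns: target_datetime_utc, solar_generation_kw, tso_zone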


def fetch_de_data_range(start: datetime, end: datetime, chunk_hours: int = 168) -> pd.DataFrame:
"""
    Fetch German solar generation over a date range by chunking it into windows
    (smaller payloads for the API and more robust retry options).
    - start/end: inclusive start / exclusive end datetime (UTC expected)
    - chunk_hours: window size (default 168h = 7 days) to keep payloads reasonable
    Returns a DataFrame with the same schema as _fetch_de_window.
"""
assert start < end, "Start date must be before end"

# Normalise to UTC and hour boundaries
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=timezone.utc)
    start = start.replace(minute=0, second=0, microsecond=0)
    end = end.replace(minute=0, second=0, microsecond=0)

# Accumulate windows to concat at end
frames = []
window = start
    step = timedelta(hours=chunk_hours)

    # Fetch one window at a time (network call and XML parse) from start to end, storing non-empty results
while window < end:
nxt = min(window + step, end)
df_chunk = _fetch_de_window(window, nxt)
if not df_chunk.empty:
frames.append(df_chunk)
window = nxt

    # If all windows are completely empty, return an empty frame with the right shape
if not frames:
return pd.DataFrame(columns=["target_datetime_utc", "solar_generation_kw", "tso_zone"])

# Concatenate to a single table
    df = pd.concat(frames, ignore_index=True)
    df = (
        df.drop_duplicates(subset=["target_datetime_utc", "tso_zone"])
        .sort_values("target_datetime_utc")
        .reset_index(drop=True)
    )
logger.info("Assembled {} rows of German solar data over range.", len(df))
return df
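
# Chunking illustration (hypothetical dates, not part of this PR): with the
# default chunk_hours=168, a 10-day range splits into one 7-day window plus a
# 3-day remainder, because each step advances by min(window + step, end):
#   fetch_de_data_range(
#       datetime(2025, 1, 1, tzinfo=timezone.utc),
#       datetime(2025, 1, 11, tzinfo=timezone.utc),
#   )
#   # -> windows [2025-01-01, 2025-01-08) and [2025-01-08, 2025-01-11)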


def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
"""
Fetch solar generation data from German bidding zones via the
ENTSOE API
ENTSOE API (24 HOUR FETCH)

Only 'generation' mode is supported for now

@@ -30,9 +155,6 @@ def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
period_start = start.strftime("%Y%m%d%H%M")
period_end = now.strftime("%Y%m%d%H%M")

# Prepare request
url = "https://web-api.tp.entsoe.eu/api" # base url for api
API_KEY = os.getenv("ENTSOE_API_KEY", "") # api key from env vars, empty string if missing
params = {
"documentType": "A75", # actual generation
"processType": "A16", # realised output
@@ -46,7 +168,7 @@ def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
# Initialise session for request
session = requests.Session()
logger.debug("Requesting German data from API with params: {}", params)
response = session.get(url, params=params)
response = session.get(URL, params=params)
try:
response.raise_for_status()
except Exception as e:
@@ -62,7 +184,7 @@ def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
for ts in root.findall(".//TimeSeries"):
zone = ts.findtext(".//inBiddingZone_Domain/Mrid")
psr = ts.findtext(".//MktPSRType/psrType")
if psr != "A-10Y1001A1001A83H": # Skips all non-solar data
if psr not in SOLAR_PSR_CODES: # Skips all non-solar data
continue

for pt in ts.findall(".//Period/Point"):
@@ -71,7 +193,7 @@ def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
try:
qty = float(qty_str)
except (TypeError, ValueError):
logger.warning("Skipping malfromed quantity (%s) in zone %s", qty_str, zone)
logger.error("Skipping malformed quantity ({}) in zone {}", qty_str, zone)
continue

# Convert and record in list
@@ -90,3 +212,26 @@ def fetch_de_data(historic_or_forecast: str = "generation") -> pd.DataFrame:
logger.info("Assembled {} rows of German solar data", len(df))

return df
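
# Minimal usage sketch (illustrative, not part of this PR; assumes
# ENTSOE_API_KEY is set; the output filename is hypothetical):
#   df = fetch_de_data()  # last 24h of German solar generation
#   df.to_csv("de_solar_24h.csv", index=False)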


if __name__ == "__main__":
    # Backfill from roughly five years ago through yesterday, then write to CSV
output_dir = os.path.join("data", "de_solar")
    os.makedirs(output_dir, exist_ok=True)
out_path = os.path.join(output_dir, "germany_solar_generation.csv")

    now_utc = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    end = (now_utc - timedelta(days=1)).replace(minute=0, second=0, microsecond=0)

# Start at first day of the month 5 years ago for clean boundaries
    past_five_years = end - timedelta(days=5 * 365)
    start = past_five_years.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

    # Perform backfill using week-long chunks, as described above
    df = fetch_de_data_range(start, end, chunk_hours=168)  # reduce chunk_hours if you hit API limits

# Write to file (done with temp to avoid partial files)
temp = out_path + ".tmp"
    df.to_csv(temp, index=False)
os.replace(temp, out_path)
logger.info("FINISHED: WROTE {} ROWS OF SOLAR GENERATION DATA TO FILE: {}", len(df), out_path)
@@ -1,7 +1,8 @@
import pytest
import requests
import pandas as pd
from solar_consumer.data.fetch_de_data import fetch_de_data
from solar_consumer.data.fetch_de_data import fetch_de_data, fetch_de_data_range
import solar_consumer.data.fetch_de_data as de_module

# Combined XML fixture: includes wind offshore (B18), wind onshore (B19)
# and solar (A-10Y1001A1001A83H), as shown in ENTSOE API docs
@@ -65,6 +66,12 @@ def dummy_get(self, url, params=None):
return DummyResp()
monkeypatch.setattr(requests.Session, "get", dummy_get)
yield

@pytest.fixture(autouse=True)
def _set_entsoe_key(monkeypatch):
    # Make sure the code under test sees a non-empty API key
monkeypatch.setenv("ENTSOE_API_KEY", "dummy")
monkeypatch.setattr(de_module, "API_KEY", "dummy", raising=False)

def test_only_solar_rows_returned():
df = fetch_de_data()
Expand All @@ -82,14 +89,34 @@ def test_assert_on_invalid_mode():
with pytest.raises(AssertionError):
        fetch_de_data(historic_or_forecast="forecast")


def test_http_error(monkeypatch):
class BadResp(DummyResp):
def __init__(self):
super().__init__(status_code=500)
monkeypatch.setattr(requests.Session, 'get', lambda self, url, params=None: BadResp())
with pytest.raises(requests.HTTPError):
fetch_de_data()

def test_range_fetch_returns_rows():
# 2-hour window spanning the 2 sample points in SAMPLE_XML
start = pd.Timestamp("2025-07-11T02:00Z")
end = pd.Timestamp("2025-07-11T04:00Z")
df = fetch_de_data_range(start.to_pydatetime(), end.to_pydatetime(), chunk_hours=1)
assert not df.empty
assert {"target_datetime_utc", "solar_generation_kw", "tso_zone"} <= set(df.columns)

    # Should be 2 points, both solar and in the fixture's zone
assert df.shape == (2, 3) and all(df["tso_zone"] == "TEST_ZONE")

def test_range_fetch_handles_empty_windows():
    # The mocked session returns the fixture regardless of the requested window; this
    # ensures the function doesn't error and returns the expected columns when empty
start = pd.Timestamp("1999-01-01T00:00Z")
end = pd.Timestamp("1999-01-01T01:00Z")
df = fetch_de_data_range(start.to_pydatetime(), end.to_pydatetime(), chunk_hours=1)
assert isinstance(df, pd.DataFrame)
assert {"target_datetime_utc", "solar_generation_kw", "tso_zone"} <= set(df.columns)


# Live test only executes if the ENTSOE_API_KEY environment variable is set
@pytest.mark.skip(reason = "Live ENTSOE endpoint often returns empty rows for the most recent 24h;\