Skip to content

Commit 44582d8

Browse files
committed
feat: add gzip decompression for in-memory dataset loading
Add universal gzip decompression that works in both browser and Node.js. Browser: uses the native DecompressionStream API (Chrome 80+, Firefox 113+). Node.js: uses the built-in zlib module (all versions). loadBenchmarkFromUrl now auto-detects .gz URLs and decompresses in memory, enabling direct loading from sources like SNAP without a local file download. New exports: decompressGzip (decompress a Uint8Array to a string), fetchAndDecompressGzip (fetch and decompress in one step), fetchWithAutoDecompress (auto-detect compression from the URL), and isGzipUrl (check whether a URL ends with .gz/.gzip).
1 parent 954f659 commit 44582d8

File tree

5 files changed

+298
-11
lines changed

5 files changed

+298
-11
lines changed

src/experiments/evaluation/fixtures/benchmark-datasets.ts

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import { dirname,resolve } from "node:path";
1313
import { fileURLToPath } from "node:url";
1414

1515
import { Graph } from "../../../algorithms/graph/graph";
16-
import { type LoadedEdge,loadEdgeList, type LoadedNode } from "../loaders/index";
16+
import { fetchWithAutoDecompress, type LoadedEdge,loadEdgeList, type LoadedNode } from "../loaders/index";
1717

1818
// ============================================================================
1919
// Types
@@ -296,30 +296,32 @@ export const loadAllBenchmarks = async (benchmarksRoot?: string): Promise<Map<st
296296
/**
297297
* Load a benchmark dataset from a URL.
298298
*
299-
* This function works in both browser and Node.js environments using fetch.
300-
* Use this when you have a direct URL to the edge list file.
299+
* This function works in both browser and Node.js environments.
300+
* Automatically handles gzip-compressed files (.gz extension).
301301
*
302-
* @param url - URL to the edge list file
302+
* @param url - URL to the edge list file (can be .txt or .txt.gz)
303303
* @param meta - Dataset metadata (for parsing configuration)
304304
* @returns Loaded benchmark with graph and metadata
305305
* @throws Error if fetch fails or parsing fails
306306
*
307307
* @example
308308
* ```typescript
309+
* // Plain text file
309310
* const benchmark = await loadBenchmarkFromUrl(
310311
* 'https://raw.githubusercontent.com/user/repo/main/data/karate.edges',
311312
* KARATE
312313
* );
314+
*
315+
* // Gzip-compressed file (automatically decompressed)
316+
* const compressed = await loadBenchmarkFromUrl(
317+
* 'https://snap.stanford.edu/data/facebook_combined.txt.gz',
318+
* FACEBOOK
319+
* );
313320
* ```
314321
*/
315322
export const loadBenchmarkFromUrl = async (url: string, meta: BenchmarkDatasetMeta): Promise<LoadedBenchmark> => {
316-
const response = await fetch(url);
317-
318-
if (!response.ok) {
319-
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
320-
}
321-
322-
const content = await response.text();
323+
// Auto-detect and handle gzip compression
324+
const content = await fetchWithAutoDecompress(url);
323325

324326
const result = loadEdgeList(content, {
325327
directed: meta.directed,

src/experiments/evaluation/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,12 @@ export {
8686

8787
// Export graph loaders
8888
export {
89+
// Decompression utilities
90+
decompressGzip,
8991
type EdgeListConfig,
92+
fetchAndDecompressGzip,
93+
fetchWithAutoDecompress,
94+
isGzipUrl,
9095
type LoadedEdge,
9196
loadEdgeList,
9297
type LoadedNode,
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/**
2+
* Universal decompression utilities for browser and Node.js environments.
3+
*
4+
* Supports gzip decompression using:
5+
* - Node.js: Built-in `zlib` module (works in all Node versions)
6+
* - Browser: Native `DecompressionStream` API (Chrome 80+, Firefox 113+, Safari 16.4+)
7+
*
8+
* No external dependencies required.
9+
*/
10+
11+
/**
12+
* Check if running in Node.js environment.
13+
*/
14+
const isNode = (): boolean =>
15+
typeof process !== "undefined" &&
16+
process.versions?.node != undefined;
17+
18+
/**
19+
* Check if DecompressionStream is available (modern browsers and Node 18+).
20+
*/
21+
const hasDecompressionStream = (): boolean =>
22+
typeof DecompressionStream !== "undefined";
23+
24+
/**
25+
* Decompress gzip data using Node.js zlib module.
26+
*
27+
* @param data - Compressed data as Uint8Array
28+
* @returns Decompressed string
29+
*/
30+
const decompressWithZlib = async (data: Uint8Array): Promise<string> => {
31+
// Dynamic import to avoid bundling issues in browser builds
32+
const { gunzip } = await import("node:zlib");
33+
const { promisify } = await import("node:util");
34+
35+
const gunzipAsync = promisify(gunzip);
36+
const decompressed = await gunzipAsync(Buffer.from(data));
37+
return decompressed.toString("utf-8");
38+
};
39+
40+
/**
41+
* Decompress gzip data using browser DecompressionStream API.
42+
*
43+
* @param data - Compressed data as Uint8Array
44+
* @returns Decompressed string
45+
*/
46+
const decompressWithStream = async (data: Uint8Array): Promise<string> => {
47+
// Create a new ArrayBuffer copy to ensure compatibility
48+
const buffer = new ArrayBuffer(data.byteLength);
49+
new Uint8Array(buffer).set(data);
50+
const blob = new Blob([buffer]);
51+
const stream = blob.stream();
52+
const decompressedStream = stream.pipeThrough(new DecompressionStream("gzip"));
53+
const response = new Response(decompressedStream);
54+
return response.text();
55+
};
56+
57+
/**
58+
* Decompress gzip-compressed data to a string.
59+
*
60+
* Automatically uses the best available method for the current environment:
61+
* - Node.js: Uses built-in zlib module
62+
* - Browser: Uses native DecompressionStream API
63+
*
64+
* @param data - Gzip-compressed data as Uint8Array
65+
* @returns Promise resolving to decompressed UTF-8 string
66+
* @throws Error if decompression fails or no decompression method is available
67+
*
68+
* @example
69+
* ```typescript
70+
* const response = await fetch('https://example.com/data.gz');
71+
* const compressed = new Uint8Array(await response.arrayBuffer());
72+
* const text = await decompressGzip(compressed);
73+
* ```
74+
*/
75+
export const decompressGzip = async (data: Uint8Array): Promise<string> => {
76+
if (isNode()) {
77+
return decompressWithZlib(data);
78+
}
79+
80+
if (hasDecompressionStream()) {
81+
return decompressWithStream(data);
82+
}
83+
84+
throw new Error(
85+
"No decompression method available. " +
86+
"Browser requires DecompressionStream API (Chrome 80+, Firefox 113+, Safari 16.4+). " +
87+
"Consider using a polyfill like 'pako' for older browsers."
88+
);
89+
};
90+
91+
/**
92+
* Fetch and decompress gzip content from a URL.
93+
*
94+
* @param url - URL to fetch gzip-compressed content from
95+
* @returns Promise resolving to decompressed UTF-8 string
96+
* @throws Error if fetch fails or decompression fails
97+
*
98+
* @example
99+
* ```typescript
100+
* const text = await fetchAndDecompressGzip('https://snap.stanford.edu/data/facebook_combined.txt.gz');
101+
* ```
102+
*/
103+
export const fetchAndDecompressGzip = async (url: string): Promise<string> => {
104+
const response = await fetch(url);
105+
106+
if (!response.ok) {
107+
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
108+
}
109+
110+
const arrayBuffer = await response.arrayBuffer();
111+
const data = new Uint8Array(arrayBuffer);
112+
113+
return decompressGzip(data);
114+
};
115+
116+
/**
117+
* Detect if a URL points to a gzip-compressed file based on extension.
118+
*
119+
* @param url - URL to check
120+
* @returns True if URL ends with .gz or .gzip
121+
*/
122+
export const isGzipUrl = (url: string): boolean => {
123+
const lowered = url.toLowerCase();
124+
return lowered.endsWith(".gz") || lowered.endsWith(".gzip");
125+
};
126+
127+
/**
128+
* Fetch content from URL, automatically decompressing if gzip.
129+
*
130+
* Detects compression based on:
131+
* 1. URL extension (.gz, .gzip)
132+
* 2. Content-Encoding header
133+
*
134+
* @param url - URL to fetch content from
135+
* @returns Promise resolving to text content (decompressed if needed)
136+
*
137+
* @example
138+
* ```typescript
139+
* // Automatically handles both compressed and uncompressed URLs
140+
* const text1 = await fetchWithAutoDecompress('https://example.com/data.txt');
141+
* const text2 = await fetchWithAutoDecompress('https://example.com/data.txt.gz');
142+
* ```
143+
*/
144+
export const fetchWithAutoDecompress = async (url: string): Promise<string> => {
145+
// If URL indicates gzip, fetch as binary and decompress
146+
if (isGzipUrl(url)) {
147+
return fetchAndDecompressGzip(url);
148+
}
149+
150+
// Otherwise, fetch as text (browser/fetch handles Content-Encoding automatically)
151+
const response = await fetch(url);
152+
153+
if (!response.ok) {
154+
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
155+
}
156+
157+
return response.text();
158+
};
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/**
2+
* Unit tests for decompression utilities
3+
*/
4+
5+
import { gzipSync } from "node:zlib";
6+
7+
import { describe, expect, it } from "vitest";
8+
9+
import { decompressGzip, fetchWithAutoDecompress, isGzipUrl } from "./decompress";
10+
11+
describe("Decompression Utilities", () => {
12+
describe("isGzipUrl", () => {
13+
it("should detect .gz extension", () => {
14+
expect(isGzipUrl("https://example.com/data.txt.gz")).toBe(true);
15+
expect(isGzipUrl("https://example.com/data.gz")).toBe(true);
16+
expect(isGzipUrl("/path/to/file.gz")).toBe(true);
17+
});
18+
19+
it("should detect .gzip extension", () => {
20+
expect(isGzipUrl("https://example.com/data.gzip")).toBe(true);
21+
expect(isGzipUrl("/path/to/file.gzip")).toBe(true);
22+
});
23+
24+
it("should be case insensitive", () => {
25+
expect(isGzipUrl("https://example.com/data.GZ")).toBe(true);
26+
expect(isGzipUrl("https://example.com/data.GZIP")).toBe(true);
27+
expect(isGzipUrl("https://example.com/data.Gz")).toBe(true);
28+
});
29+
30+
it("should return false for non-gzip URLs", () => {
31+
expect(isGzipUrl("https://example.com/data.txt")).toBe(false);
32+
expect(isGzipUrl("https://example.com/data.zip")).toBe(false);
33+
expect(isGzipUrl("https://example.com/data.tar")).toBe(false);
34+
expect(isGzipUrl("https://example.com/data")).toBe(false);
35+
});
36+
});
37+
38+
describe("decompressGzip", () => {
39+
it("should decompress gzip data", async () => {
40+
const originalText = "Hello, World! This is a test.";
41+
const compressed = gzipSync(Buffer.from(originalText));
42+
const data = new Uint8Array(compressed);
43+
44+
const result = await decompressGzip(data);
45+
46+
expect(result).toBe(originalText);
47+
});
48+
49+
it("should handle multi-line content", async () => {
50+
const originalText = "line1\nline2\nline3";
51+
const compressed = gzipSync(Buffer.from(originalText));
52+
const data = new Uint8Array(compressed);
53+
54+
const result = await decompressGzip(data);
55+
56+
expect(result).toBe(originalText);
57+
});
58+
59+
it("should handle edge list format", async () => {
60+
const edgeList = "1 2\n2 3\n3 4\n4 1";
61+
const compressed = gzipSync(Buffer.from(edgeList));
62+
const data = new Uint8Array(compressed);
63+
64+
const result = await decompressGzip(data);
65+
66+
expect(result).toBe(edgeList);
67+
expect(result.split("\n")).toHaveLength(4);
68+
});
69+
70+
it("should handle unicode content", async () => {
71+
// Test with accented characters and CJK characters (no emoji per lint rules)
72+
const unicodeText = "Hello World! \u65E5\u672C\u8A9E";
73+
const compressed = gzipSync(Buffer.from(unicodeText));
74+
const data = new Uint8Array(compressed);
75+
76+
const result = await decompressGzip(data);
77+
78+
expect(result).toBe(unicodeText);
79+
});
80+
81+
it("should handle empty content", async () => {
82+
const emptyText = "";
83+
const compressed = gzipSync(Buffer.from(emptyText));
84+
const data = new Uint8Array(compressed);
85+
86+
const result = await decompressGzip(data);
87+
88+
expect(result).toBe(emptyText);
89+
});
90+
91+
it("should handle large content", async () => {
92+
// Create a moderately large edge list (10k edges)
93+
const lines: string[] = [];
94+
for (let index = 0; index < 10_000; index++) {
95+
lines.push(`${index} ${index + 1}`);
96+
}
97+
const largeText = lines.join("\n");
98+
const compressed = gzipSync(Buffer.from(largeText));
99+
const data = new Uint8Array(compressed);
100+
101+
const result = await decompressGzip(data);
102+
103+
expect(result).toBe(largeText);
104+
expect(result.split("\n")).toHaveLength(10_000);
105+
});
106+
});
107+
108+
describe("fetchWithAutoDecompress", () => {
109+
it("should export the function", () => {
110+
expect(typeof fetchWithAutoDecompress).toBe("function");
111+
});
112+
113+
// Network tests would require mocking fetch or using a test server
114+
// These are integration tests that verify the function exists and has the right signature
115+
});
116+
});

src/experiments/evaluation/loaders/index.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
* Support for loading benchmark datasets in various formats.
55
*/
66

7+
export {
8+
decompressGzip,
9+
fetchAndDecompressGzip,
10+
fetchWithAutoDecompress,
11+
isGzipUrl,
12+
} from "./decompress";
713
export {
814
type EdgeListConfig,
915
type LoadedEdge,

0 commit comments

Comments
 (0)