Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions src/lib/url-cleaner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/**
* URL Cleaner - Removes tracking parameters from URLs
*
* This module provides functionality to clean URLs by removing
* common tracking parameters (e.g., spm, utm_*, etc.)
*/

/**
 * Query-parameter names that are stripped from URLs when no explicit list
 * is supplied. Entries are grouped by the platform that emits them; the
 * order of the array is part of the exported value and is kept stable.
 */
export const DEFAULT_TRACKING_PARAMS: string[] = [
  'spm', 'scm', // Alibaba / Aliyun
  '_dt_ac', '_dt_sig', '_dt_ts', // DingTalk
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', // Google Analytics UTM
  'fbclid', // Facebook
  'gclid', 'gclsrc', // Google Ads
  'msclkid', // Microsoft / Bing
  'twclid', // Twitter
  'ttclid', // TikTok
  '_ga', '_gl', 'mc_cid', 'mc_eid', // Other common tracking params
];

/**
 * Removes specified tracking parameters from a URL.
 *
 * The URL is parsed with the WHATWG `URL` class, so the returned string is
 * always in normalized form (for example `https://example.com` becomes
 * `https://example.com/`), even when no parameter is removed.
 *
 * Parameters that are kept retain their original encoding: the raw query is
 * filtered segment by segment instead of being re-serialized through
 * `URLSearchParams`, which would rewrite `%20` as `+`, append `=` to
 * value-less keys and percent-encode characters such as `/` or `,`.
 *
 * @param url - The URL to clean
 * @param paramsToRemove - List of parameter names to remove (case-insensitive)
 * @returns The cleaned URL, or the original value unchanged if it is empty,
 * not a string, or cannot be parsed as a URL
 *
 * @example
 * ```typescript
 * cleanUrl('https://example.com?spm=123&id=456')
 * // Returns: 'https://example.com/?id=456'
 *
 * cleanUrl('https://example.com?utm_source=google&page=1', ['utm_source'])
 * // Returns: 'https://example.com/?page=1'
 * ```
 */
export function cleanUrl(
  url: string,
  paramsToRemove: string[] = DEFAULT_TRACKING_PARAMS,
): string {
  // Return the input untouched if it is empty or not a string
  if (!url || typeof url !== 'string') {
    return url;
  }

  try {
    const urlObj = new URL(url);

    // Lowercased names for case-insensitive matching
    const namesToRemove = new Set(paramsToRemove.map(p => p.toLowerCase()));

    // Decoded parameter names in query order. The query parser yields exactly
    // one entry per non-empty `&`-separated segment, so index i here
    // corresponds to the i-th non-empty raw segment below.
    const isTracked: boolean[] = [];
    urlObj.searchParams.forEach((_, key) => {
      isTracked.push(namesToRemove.has(key.toLowerCase()));
    });

    // Nothing to strip: return the normalized URL with its query untouched
    if (!isTracked.includes(true)) {
      return urlObj.toString();
    }

    // Filter the raw (still-encoded) segments so that kept parameters are
    // emitted exactly as they appeared in the parsed URL.
    const keptSegments = urlObj.search
      .slice(1) // drop the leading '?'
      .split('&')
      .filter(segment => segment !== '')
      .filter((_, index) => isTracked[index] !== true);

    // An explicit '?' prefix stops the setter from swallowing a literal '?'
    // at the start of the first kept segment; '' removes the query entirely.
    urlObj.search = keptSegments.length > 0 ? `?${keptSegments.join('&')}` : '';

    return urlObj.toString();
  }
  catch {
    // If URL parsing (or anything above) fails, return the original URL unchanged
    return url;
  }
}

/**
 * Builds a single-argument URL cleaner bound to a fixed parameter list.
 *
 * The returned function forwards each URL to `cleanUrl` together with the
 * list captured here, which is convenient when a `(url) => string` callback
 * is required.
 *
 * @param paramsToRemove - List of parameter names to remove
 * @returns A function that cleans URLs using the specified parameters
 *
 * @example
 * ```typescript
 * const cleaner = createUrlCleaner(['spm', 'utm_source']);
 * cleaner('https://example.com?spm=123&id=456');
 * // Returns: 'https://example.com/?id=456'
 * ```
 */
export function createUrlCleaner(
  paramsToRemove: string[] = DEFAULT_TRACKING_PARAMS,
): (url: string) => string {
  const boundCleaner = (target: string): string => {
    return cleanUrl(target, paramsToRemove);
  };
  return boundCleaner;
}
8 changes: 6 additions & 2 deletions src/services/link-export-service.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { CustomFormatsProvider, MarkdownFormatter } from './shared-types.js';
import { cleanUrl } from '../lib/url-cleaner.js';

// Type Definitions
export type LinkExportFormat = 'link' | 'custom-format';
Expand Down Expand Up @@ -56,16 +57,19 @@ export class LinkExportService {
// Validate options
validateLinkExportOptions(options);

// Clean URL to remove tracking parameters
const cleanedUrl = cleanUrl(options.url);

// Route to appropriate formatter
switch (options.format) {
case 'link':
return this.markdown.linkTo(options.title, options.url);
return this.markdown.linkTo(options.title, cleanedUrl);

case 'custom-format':
// We already validated that customFormatSlot exists
return renderCustomFormatLink(
options.title,
options.url,
cleanedUrl,
options.customFormatSlot!,
// TODO: implement flexible title formatter.
// See https://github.com/yorkxin/copy-as-markdown/issues/133
Expand Down
6 changes: 5 additions & 1 deletion src/services/tab-export-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { Tab, TabGroup, TabListGrouper } from '../lib/tabs.js';
import CustomFormatClass from '../lib/custom-format.js';
import type { CustomFormatsProvider, MarkdownFormatter } from './shared-types.js';
import { createBrowserTabDataFetcher } from './browser-tab-data-fetcher.js';
import { cleanUrl } from '../lib/url-cleaner.js';

export type ExportFormat = 'link' | 'title' | 'url' | 'custom-format';
export type ListType = 'list' | 'task-list';
Expand Down Expand Up @@ -35,10 +36,11 @@ export function validateOptions(options: ExportTabsOptions): void {
export function convertBrowserTabsToTabs(
browserTabs: chrome.tabs.Tab[],
escapeLinkText: (text: string) => string,
urlCleaner: (url: string) => string = url => url,
): Tab[] {
return browserTabs.map(tab => new Tab(
escapeLinkText(tab.title || ''),
tab.url || '',
urlCleaner(tab.url || ''),
tab.groupId || TabGroup.NonGroupId,
));
}
Expand Down Expand Up @@ -193,9 +195,11 @@ export class TabExportService {
const browserGroups = await this.tabDataFetcher.fetchTabGroups(options.windowId);

// Convert and process (pure functions)
// Clean URLs to remove tracking parameters
const tabs = convertBrowserTabsToTabs(
browserTabs,
text => this.markdown.escapeLinkText(text),
cleanUrl,
);
const groups = convertBrowserTabGroups(browserGroups);
const tabLists = groupTabsIntoLists(tabs, groups);
Expand Down
8 changes: 5 additions & 3 deletions test/services/link-export-service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ describe('linkExportService', () => {
url: 'https://example.com',
});

expect(result).toBe('[Example](https://example.com)');
// Note: URL is normalized by cleanUrl (adds trailing slash)
expect(result).toBe('[Example](https://example.com/)');
expect(mockProvider.get).not.toHaveBeenCalled();
});

Expand Down Expand Up @@ -169,11 +170,12 @@ describe('linkExportService', () => {
customFormatSlot: '1',
});

expect(result).toBe('Custom: Test \\[Link\\] -> https://example.com');
// Note: URL is normalized by cleanUrl (adds trailing slash)
expect(result).toBe('Custom: Test \\[Link\\] -> https://example.com/');
expect(mockProvider.get).toHaveBeenCalledWith('single-link', '1');
expect(mockCustomFormat.render).toHaveBeenCalledWith({
title: 'Test \\[Link\\]',
url: 'https://example.com',
url: 'https://example.com/',
number: 1,
});
});
Expand Down
168 changes: 168 additions & 0 deletions test/url-cleaner.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import { describe, expect, it } from 'vitest';
import { cleanUrl, createUrlCleaner, DEFAULT_TRACKING_PARAMS } from '../src/lib/url-cleaner.js';

// Test suite for the url-cleaner module: cleanUrl, DEFAULT_TRACKING_PARAMS
// and createUrlCleaner. Expected values reflect that parsed URLs are
// returned in normalized form (a bare origin gains a trailing slash).
describe('url-cleaner', () => {
  describe('cleanUrl', () => {
    describe('removes spm parameter', () => {
      it('should remove spm from Aliyun URL', () => {
        expect(
          cleanUrl('https://www.aliyun.com/exp/llm?spm=5176.42028462.J_zsjI5GlfrwE0jnH6hsdfr.3.e939154ai51unt'),
        ).toBe('https://www.aliyun.com/exp/llm');
      });

      it('should remove spm and keep other params from Alibaba URL', () => {
        expect(
          cleanUrl('https://fbi.alibaba-inc.com/dashboard/view/page.htm?spm=a2o1z.8190073.0.0.d7a0543flHuS4p&id=1392902'),
        ).toBe('https://fbi.alibaba-inc.com/dashboard/view/page.htm?id=1392902');
      });

      it('should handle spm in the middle of params', () => {
        expect(cleanUrl('https://example.com?id=1&spm=abc123&page=2'))
          .toBe('https://example.com/?id=1&page=2');
      });
    });

    describe('removes UTM parameters', () => {
      it('should remove all UTM params', () => {
        expect(
          cleanUrl('https://example.com?utm_source=google&utm_medium=cpc&utm_campaign=test&id=123'),
        ).toBe('https://example.com/?id=123');
      });

      it('should remove utm_term and utm_content', () => {
        expect(cleanUrl('https://example.com?utm_term=keyword&utm_content=banner&page=1'))
          .toBe('https://example.com/?page=1');
      });
    });

    describe('removes other tracking parameters', () => {
      it('should remove Facebook fbclid', () => {
        expect(cleanUrl('https://example.com?fbclid=IwAR3xyz&article=123'))
          .toBe('https://example.com/?article=123');
      });

      it('should remove Google gclid', () => {
        expect(cleanUrl('https://example.com?gclid=abc123&product=456'))
          .toBe('https://example.com/?product=456');
      });

      it('should remove Microsoft msclkid', () => {
        expect(cleanUrl('https://example.com?msclkid=xyz789&item=100'))
          .toBe('https://example.com/?item=100');
      });

      it('should remove scm parameter', () => {
        expect(cleanUrl('https://example.com?scm=abc&id=1'))
          .toBe('https://example.com/?id=1');
      });
    });

    describe('case insensitivity', () => {
      it('should remove SPM (uppercase)', () => {
        expect(cleanUrl('https://example.com?SPM=abc123&id=1'))
          .toBe('https://example.com/?id=1');
      });

      it('should remove UTM_SOURCE (uppercase)', () => {
        expect(cleanUrl('https://example.com?UTM_SOURCE=google&id=1'))
          .toBe('https://example.com/?id=1');
      });
    });

    describe('edge cases', () => {
      it('should return original URL if no tracking params', () => {
        // The URL object normalizes the input by inserting a slash before the query
        expect(cleanUrl('https://example.com?id=123&page=2'))
          .toBe('https://example.com/?id=123&page=2');
      });

      it('should handle URL without query params', () => {
        expect(cleanUrl('https://example.com/page')).toBe('https://example.com/page');
      });

      it('should handle empty string', () => {
        expect(cleanUrl('')).toBe('');
      });

      it('should handle invalid URL gracefully', () => {
        // Unparseable input must come back exactly as given
        const unparseable = 'not-a-valid-url';
        expect(cleanUrl(unparseable)).toBe(unparseable);
      });

      it('should handle URL with hash', () => {
        expect(cleanUrl('https://example.com?spm=abc#section'))
          .toBe('https://example.com/#section');
      });

      it('should remove all params if all are tracking params', () => {
        expect(cleanUrl('https://example.com?spm=abc&utm_source=google&fbclid=xyz'))
          .toBe('https://example.com/');
      });

      it('should handle multiple spm-like params', () => {
        expect(cleanUrl('https://example.com?spm=a&scm=b&id=1'))
          .toBe('https://example.com/?id=1');
      });
    });

    describe('custom params list', () => {
      it('should only remove specified params', () => {
        expect(cleanUrl('https://example.com?spm=abc&custom=123&id=1', ['custom']))
          .toBe('https://example.com/?spm=abc&id=1');
      });

      it('should handle empty params list', () => {
        // The URL object normalizes the input by inserting a slash before the query
        expect(cleanUrl('https://example.com?spm=abc&id=1', []))
          .toBe('https://example.com/?spm=abc&id=1');
      });
    });
  });

  describe('DEFAULT_TRACKING_PARAMS', () => {
    it('should include spm', () => {
      expect(DEFAULT_TRACKING_PARAMS).toContain('spm');
    });

    it('should include common UTM params', () => {
      for (const name of ['utm_source', 'utm_medium', 'utm_campaign']) {
        expect(DEFAULT_TRACKING_PARAMS).toContain(name);
      }
    });

    it('should include major platform tracking params', () => {
      for (const name of ['fbclid', 'gclid', 'msclkid']) {
        expect(DEFAULT_TRACKING_PARAMS).toContain(name);
      }
    });
  });

  describe('createUrlCleaner', () => {
    it('should create a cleaner with default params', () => {
      const defaultCleaner = createUrlCleaner();
      expect(defaultCleaner('https://example.com?spm=abc&id=1'))
        .toBe('https://example.com/?id=1');
    });

    it('should create a cleaner with custom params', () => {
      const customCleaner = createUrlCleaner(['custom_param']);
      expect(customCleaner('https://example.com?custom_param=abc&spm=xyz&id=1'))
        .toBe('https://example.com/?spm=xyz&id=1');
    });
  });
});