From b4d6039b748768d661cef95a56429a5eabaf5b7a Mon Sep 17 00:00:00 2001 From: "kunlun.ykl" Date: Wed, 4 Mar 2026 16:26:09 +0800 Subject: [PATCH] feat: add URL tracking parameters cleaner - Add url-cleaner module to remove tracking params (spm, utm_*, fbclid, gclid, etc.) - Integrate URL cleaning in LinkExportService and TabExportService - Add comprehensive unit tests for URL cleaner (25 tests) --- src/lib/url-cleaner.ts | 113 +++++++++++++++ src/services/link-export-service.ts | 8 +- src/services/tab-export-service.ts | 6 +- test/services/link-export-service.test.ts | 8 +- test/url-cleaner.test.ts | 168 ++++++++++++++++++++++ 5 files changed, 297 insertions(+), 6 deletions(-) create mode 100644 src/lib/url-cleaner.ts create mode 100644 test/url-cleaner.test.ts diff --git a/src/lib/url-cleaner.ts b/src/lib/url-cleaner.ts new file mode 100644 index 00000000..643acf70 --- /dev/null +++ b/src/lib/url-cleaner.ts @@ -0,0 +1,113 @@ +/** + * URL Cleaner - Removes tracking parameters from URLs + * + * This module provides functionality to clean URLs by removing + * common tracking parameters (e.g., spm, utm_*, etc.) + */ + +/** + * Default list of tracking parameters to remove from URLs. + * These are common tracking parameters used by various platforms. + */ +export const DEFAULT_TRACKING_PARAMS: string[] = [ + // Alibaba/Aliyun tracking + 'spm', + 'scm', + // DingTalk tracking + '_dt_ac', + '_dt_sig', + '_dt_ts', + // Google Analytics UTM parameters + 'utm_source', + 'utm_medium', + 'utm_campaign', + 'utm_term', + 'utm_content', + // Facebook + 'fbclid', + // Google Ads + 'gclid', + 'gclsrc', + // Microsoft/Bing + 'msclkid', + // Twitter + 'twclid', + // TikTok + 'ttclid', + // Other common tracking params + '_ga', + '_gl', + 'mc_cid', + 'mc_eid', +]; + +/** + * Removes specified tracking parameters from a URL. 
+ * + * @param url - The URL to clean + * @param paramsToRemove - List of parameter names to remove (case-insensitive) + * @returns The cleaned URL as re-serialized by the URL parser (a bare origin gains a trailing slash), or the original URL if parsing fails + * + * @example + * ```typescript + * cleanUrl('https://example.com?spm=123&id=456') + * // Returns: 'https://example.com/?id=456' + * + * cleanUrl('https://example.com?utm_source=google&page=1', ['utm_source']) + * // Returns: 'https://example.com/?page=1' + * ``` + */ +export function cleanUrl( + url: string, + paramsToRemove: string[] = DEFAULT_TRACKING_PARAMS, +): string { + // Return the input unchanged if it's empty or not a string + if (!url || typeof url !== 'string') { + return url; + } + + try { + const urlObj = new URL(url); + + // Create a set of lowercase param names for case-insensitive matching + const paramsToRemoveSet = new Set( + paramsToRemove.map(p => p.toLowerCase()), + ); + + // Collect params to delete (can't modify while iterating) + const keysToDelete: string[] = []; + urlObj.searchParams.forEach((_, key) => { + if (paramsToRemoveSet.has(key.toLowerCase())) { + keysToDelete.push(key); + } + }); + + // Delete the tracking params + keysToDelete.forEach(key => urlObj.searchParams.delete(key)); + + return urlObj.toString(); + } + catch { + // If URL parsing fails, return the original URL unchanged + return url; + } +} + +/** + * Creates a URL cleaner function with pre-configured parameters. 
+ * + * @param paramsToRemove - List of parameter names to remove + * @returns A function that cleans URLs using the specified parameters + * + * @example + * ```typescript + * const cleaner = createUrlCleaner(['spm', 'utm_source']); + * cleaner('https://example.com?spm=123&id=456'); + * // Returns: 'https://example.com/?id=456' + * ``` + */ +export function createUrlCleaner( + paramsToRemove: string[] = DEFAULT_TRACKING_PARAMS, +): (url: string) => string { + return (url: string) => cleanUrl(url, paramsToRemove); +} diff --git a/src/services/link-export-service.ts b/src/services/link-export-service.ts index dec220d3..9e9ff147 100644 --- a/src/services/link-export-service.ts +++ b/src/services/link-export-service.ts @@ -1,4 +1,5 @@ import type { CustomFormatsProvider, MarkdownFormatter } from './shared-types.js'; +import { cleanUrl } from '../lib/url-cleaner.js'; // Type Definitions export type LinkExportFormat = 'link' | 'custom-format'; @@ -56,16 +57,19 @@ export class LinkExportService { // Validate options validateLinkExportOptions(options); + // Clean URL to remove tracking parameters + const cleanedUrl = cleanUrl(options.url); + // Route to appropriate formatter switch (options.format) { case 'link': - return this.markdown.linkTo(options.title, options.url); + return this.markdown.linkTo(options.title, cleanedUrl); case 'custom-format': // We already validated that customFormatSlot exists return renderCustomFormatLink( options.title, - options.url, + cleanedUrl, options.customFormatSlot!, // TODO: implement flexible title formatter. 
// See https://github.com/yorkxin/copy-as-markdown/issues/133 diff --git a/src/services/tab-export-service.ts b/src/services/tab-export-service.ts index 378c3a79..2b00fe9e 100644 --- a/src/services/tab-export-service.ts +++ b/src/services/tab-export-service.ts @@ -5,6 +5,7 @@ import { Tab, TabGroup, TabListGrouper } from '../lib/tabs.js'; import CustomFormatClass from '../lib/custom-format.js'; import type { CustomFormatsProvider, MarkdownFormatter } from './shared-types.js'; import { createBrowserTabDataFetcher } from './browser-tab-data-fetcher.js'; +import { cleanUrl } from '../lib/url-cleaner.js'; export type ExportFormat = 'link' | 'title' | 'url' | 'custom-format'; export type ListType = 'list' | 'task-list'; @@ -35,10 +36,11 @@ export function validateOptions(options: ExportTabsOptions): void { export function convertBrowserTabsToTabs( browserTabs: chrome.tabs.Tab[], escapeLinkText: (text: string) => string, + urlCleaner: (url: string) => string = url => url, ): Tab[] { return browserTabs.map(tab => new Tab( escapeLinkText(tab.title || ''), - tab.url || '', + urlCleaner(tab.url || ''), tab.groupId || TabGroup.NonGroupId, )); } @@ -193,9 +195,11 @@ export class TabExportService { const browserGroups = await this.tabDataFetcher.fetchTabGroups(options.windowId); // Convert and process (pure functions) + // Clean URLs to remove tracking parameters const tabs = convertBrowserTabsToTabs( browserTabs, text => this.markdown.escapeLinkText(text), + cleanUrl, ); const groups = convertBrowserTabGroups(browserGroups); const tabLists = groupTabsIntoLists(tabs, groups); diff --git a/test/services/link-export-service.test.ts b/test/services/link-export-service.test.ts index 073e1974..b0d3ad01 100644 --- a/test/services/link-export-service.test.ts +++ b/test/services/link-export-service.test.ts @@ -127,7 +127,8 @@ describe('linkExportService', () => { url: 'https://example.com', }); - expect(result).toBe('[Example](https://example.com)'); + // Note: URL is normalized by 
cleanUrl (adds trailing slash) + expect(result).toBe('[Example](https://example.com/)'); expect(mockProvider.get).not.toHaveBeenCalled(); }); @@ -169,11 +170,12 @@ describe('linkExportService', () => { customFormatSlot: '1', }); - expect(result).toBe('Custom: Test \\[Link\\] -> https://example.com'); + // Note: URL is normalized by cleanUrl (adds trailing slash) + expect(result).toBe('Custom: Test \\[Link\\] -> https://example.com/'); expect(mockProvider.get).toHaveBeenCalledWith('single-link', '1'); expect(mockCustomFormat.render).toHaveBeenCalledWith({ title: 'Test \\[Link\\]', - url: 'https://example.com', + url: 'https://example.com/', number: 1, }); }); diff --git a/test/url-cleaner.test.ts b/test/url-cleaner.test.ts new file mode 100644 index 00000000..2c2368af --- /dev/null +++ b/test/url-cleaner.test.ts @@ -0,0 +1,168 @@ +import { describe, expect, it } from 'vitest'; +import { cleanUrl, createUrlCleaner, DEFAULT_TRACKING_PARAMS } from '../src/lib/url-cleaner.js'; + +describe('url-cleaner', () => { + describe('cleanUrl', () => { + describe('removes spm parameter', () => { + it('should remove spm from Aliyun URL', () => { + const url = 'https://www.aliyun.com/exp/llm?spm=5176.42028462.J_zsjI5GlfrwE0jnH6hsdfr.3.e939154ai51unt'; + const expected = 'https://www.aliyun.com/exp/llm'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove spm and keep other params from Alibaba URL', () => { + const url = 'https://fbi.alibaba-inc.com/dashboard/view/page.htm?spm=a2o1z.8190073.0.0.d7a0543flHuS4p&id=1392902'; + const expected = 'https://fbi.alibaba-inc.com/dashboard/view/page.htm?id=1392902'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should handle spm in the middle of params', () => { + const url = 'https://example.com?id=1&spm=abc123&page=2'; + const expected = 'https://example.com/?id=1&page=2'; + expect(cleanUrl(url)).toBe(expected); + }); + }); + + describe('removes UTM parameters', () => { + it('should remove all UTM params', () => { + 
const url = 'https://example.com?utm_source=google&utm_medium=cpc&utm_campaign=test&id=123'; + const expected = 'https://example.com/?id=123'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove utm_term and utm_content', () => { + const url = 'https://example.com?utm_term=keyword&utm_content=banner&page=1'; + const expected = 'https://example.com/?page=1'; + expect(cleanUrl(url)).toBe(expected); + }); + }); + + describe('removes other tracking parameters', () => { + it('should remove Facebook fbclid', () => { + const url = 'https://example.com?fbclid=IwAR3xyz&article=123'; + const expected = 'https://example.com/?article=123'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove Google gclid', () => { + const url = 'https://example.com?gclid=abc123&product=456'; + const expected = 'https://example.com/?product=456'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove Microsoft msclkid', () => { + const url = 'https://example.com?msclkid=xyz789&item=100'; + const expected = 'https://example.com/?item=100'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove scm parameter', () => { + const url = 'https://example.com?scm=abc&id=1'; + const expected = 'https://example.com/?id=1'; + expect(cleanUrl(url)).toBe(expected); + }); + }); + + describe('case insensitivity', () => { + it('should remove SPM (uppercase)', () => { + const url = 'https://example.com?SPM=abc123&id=1'; + const expected = 'https://example.com/?id=1'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove UTM_SOURCE (uppercase)', () => { + const url = 'https://example.com?UTM_SOURCE=google&id=1'; + const expected = 'https://example.com/?id=1'; + expect(cleanUrl(url)).toBe(expected); + }); + }); + + describe('edge cases', () => { + it('should return original URL if no tracking params', () => { + const url = 'https://example.com?id=123&page=2'; + // Note: URL object normalizes the URL by adding a trailing slash before query + 
expect(cleanUrl(url)).toBe('https://example.com/?id=123&page=2'); + }); + + it('should handle URL without query params', () => { + const url = 'https://example.com/page'; + expect(cleanUrl(url)).toBe('https://example.com/page'); + }); + + it('should handle empty string', () => { + expect(cleanUrl('')).toBe(''); + }); + + it('should handle invalid URL gracefully', () => { + const invalidUrl = 'not-a-valid-url'; + expect(cleanUrl(invalidUrl)).toBe(invalidUrl); + }); + + it('should handle URL with hash', () => { + const url = 'https://example.com?spm=abc#section'; + const expected = 'https://example.com/#section'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should remove all params if all are tracking params', () => { + const url = 'https://example.com?spm=abc&utm_source=google&fbclid=xyz'; + const expected = 'https://example.com/'; + expect(cleanUrl(url)).toBe(expected); + }); + + it('should handle multiple spm-like params', () => { + const url = 'https://example.com?spm=a&scm=b&id=1'; + const expected = 'https://example.com/?id=1'; + expect(cleanUrl(url)).toBe(expected); + }); + }); + + describe('custom params list', () => { + it('should only remove specified params', () => { + const url = 'https://example.com?spm=abc&custom=123&id=1'; + const expected = 'https://example.com/?spm=abc&id=1'; + expect(cleanUrl(url, ['custom'])).toBe(expected); + }); + + it('should handle empty params list', () => { + const url = 'https://example.com?spm=abc&id=1'; + // Note: URL object normalizes the URL by adding a trailing slash before query + expect(cleanUrl(url, [])).toBe('https://example.com/?spm=abc&id=1'); + }); + }); + }); + + describe('DEFAULT_TRACKING_PARAMS', () => { + it('should include spm', () => { + expect(DEFAULT_TRACKING_PARAMS).toContain('spm'); + }); + + it('should include common UTM params', () => { + expect(DEFAULT_TRACKING_PARAMS).toContain('utm_source'); + expect(DEFAULT_TRACKING_PARAMS).toContain('utm_medium'); + 
expect(DEFAULT_TRACKING_PARAMS).toContain('utm_campaign'); + }); + + it('should include major platform tracking params', () => { + expect(DEFAULT_TRACKING_PARAMS).toContain('fbclid'); + expect(DEFAULT_TRACKING_PARAMS).toContain('gclid'); + expect(DEFAULT_TRACKING_PARAMS).toContain('msclkid'); + }); + }); + + describe('createUrlCleaner', () => { + it('should create a cleaner with default params', () => { + const cleaner = createUrlCleaner(); + const url = 'https://example.com?spm=abc&id=1'; + const expected = 'https://example.com/?id=1'; + expect(cleaner(url)).toBe(expected); + }); + + it('should create a cleaner with custom params', () => { + const cleaner = createUrlCleaner(['custom_param']); + const url = 'https://example.com?custom_param=abc&spm=xyz&id=1'; + const expected = 'https://example.com/?spm=xyz&id=1'; + expect(cleaner(url)).toBe(expected); + }); + }); +});