Skip to content

Commit 931ef44

Browse files
authored
feat(input_schema): Validate regexp patterns (#553)
Solves apify/apify-core#23455: For patterns in `pattern` (string) and `patternKey` and `patternValue` (object and array) do: - check if the regex can be compiled (`new RegExp(pattern)`) - check if the regex is safe using the [safe-regex](https://www.npmjs.com/package/safe-regex) package > WARNING: This module has both false positives and false negatives. Use [vuln-regex-detector](https://github.com/davisjam/vuln-regex-detector) for improved accuracy. ^ I think we can live with that? 🤔 (The `vuln-regex-detector` is more complex and queries a remote server instead of using static analysis / heuristics) The heuristic used in `safe-regex` is quite easy to understand: https://github.com/davisjam/safe-regex/blob/master/src/heuristic-analyzer.js: ### Star height > 1 > What is Star Height? It’s the maximum nesting depth of repetition operators (*, +, {m,n}) in the regex. > >For example: > >- `a*` → star height = 1 > `(a*)*` → star height = 2 > `((a+)+)*` → star height = 3 > >Why this matters: > >Nested quantifiers like `(a+)+` or `(a|aa)+` are classic ReDoS patterns. >When regex engines try to match them against long strings, they can backtrack exponentially. ### Number of repetitions exceeds a threshold > Threshold is set to 25, can be customized. > This checks how many total quantifiers (`*`, `+`, `{m,n}`) appear in the regex. > If the total count exceeds a configurable limit (this.options.heuristic_replimit), it’s flagged as risky. > > This heuristic is much weaker — many safe regexes can have multiple quantifiers — but it helps detect suspiciously repetitive patterns.
1 parent 2c918b4 commit 931ef44

6 files changed

Lines changed: 234 additions & 4 deletions

File tree

package-lock.json

Lines changed: 27 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/input_schema/package.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,13 @@
5252
"@apify/input_secrets": "^1.2.9",
5353
"@apify/json_schemas": "^0.5.1",
5454
"acorn-loose": "^8.4.0",
55-
"countries-list": "^3.0.0"
55+
"countries-list": "^3.0.0",
56+
"safe-regex": "^2.1.1"
5657
},
5758
"peerDependencies": {
5859
"ajv": "^8.0.0"
60+
},
61+
"devDependencies": {
62+
"@types/safe-regex": "^1.1.6"
5963
}
6064
}

packages/input_schema/src/input_schema.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@ import { inputSchema as schema } from '@apify/json_schemas';
55

66
import { m } from './intl';
77
import type {
8+
ArrayFieldDefinition,
89
CommonResourceFieldDefinition,
910
FieldDefinition,
1011
InputSchema,
1112
InputSchemaBaseChecked,
13+
ObjectFieldDefinition,
1214
StringFieldDefinition,
1315
} from './types';
14-
import { ensureAjvSupportsDraft2019 } from './utilities';
16+
import { ensureAjvSupportsDraft2019, validateRegexpPattern } from './utilities';
1517

1618
export { schema as inputSchema };
1719

@@ -131,7 +133,12 @@ function validateBasicStructure(validator: Ajv, obj: Record<string, unknown>): a
131133
* @param fieldKey Key of the field in the input schema.
132134
* @param isSubField If true, the field is a sub-field of another field, so we need to skip some definitions.
133135
*/
134-
function validateField(validator: Ajv, fieldSchema: Record<string, unknown>, fieldKey: string, isSubField = false): asserts fieldSchema is FieldDefinition {
136+
function validateFieldAgainstSchemaDefinition(
137+
validator: Ajv,
138+
fieldSchema: Record<string, unknown>,
139+
fieldKey: string,
140+
isSubField = false,
141+
): asserts fieldSchema is FieldDefinition {
135142
const relevantDefinitions = isSubField ? subFieldDefinitions : fieldDefinitions;
136143

137144
const matchingDefinitions = Object
@@ -188,6 +195,26 @@ function validateField(validator: Ajv, fieldSchema: Record<string, unknown>, fie
188195
validateAgainstSchemaOrThrow(validator, fieldSchema, enhanceDefinition(definition), `schema.properties.${fieldKey}`);
189196
}
190197

198+
/**
199+
* Validates a particular field against its schema and other rules (like regex patterns).
200+
* @param validator An instance of AJV validator (must support draft 2019-09).
201+
* @param fieldSchema Schema of the field to validate.
202+
* @param fieldKey Key of the field in the input schema.
203+
* @param isSubField If true, the field is a sub-field of another field, so we need to skip some definitions.
204+
*/
205+
function validateField(validator: Ajv, fieldSchema: Record<string, unknown>, fieldKey: string, isSubField = false): asserts fieldSchema is FieldDefinition {
206+
// Validate against schema definition first.
207+
validateFieldAgainstSchemaDefinition(validator, fieldSchema, fieldKey, isSubField);
208+
209+
// Validate regex patterns if defined.
210+
const { pattern } = fieldSchema as Partial<StringFieldDefinition>;
211+
const { patternKey, patternValue } = fieldSchema as Partial<ObjectFieldDefinition & ArrayFieldDefinition>;
212+
213+
if (pattern) validateRegexpPattern(pattern, `${fieldKey}.pattern`);
214+
if (patternKey) validateRegexpPattern(patternKey, `${fieldKey}.patternKey`);
215+
if (patternValue) validateRegexpPattern(patternValue, `${fieldKey}.patternValue`);
216+
}
217+
191218
/**
192219
* Validates all subfields (and their subfields) of a given field schema.
193220
*/

packages/input_schema/src/intl.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ const intlStrings = {
3535
'Field {rootName}.{fieldKey}.apifyProxyGroups must be an array of strings.',
3636
'inputSchema.validation.secretFieldSchemaChanged':
3737
'The field schema.properties.{fieldKey} is a secret field, but its schema has changed. Please update the value in the input editor.',
38+
'inputSchema.validation.regexpNotValid':
39+
'The regular expression "{pattern}" in field schema.properties.{fieldKey} must be valid.',
40+
'inputSchema.validation.regexpNotSafe':
41+
'The regular expression "{pattern}" in field schema.properties.{fieldKey} may cause excessive backtracking or be unsafe to execute.',
3842
};
3943

4044
/**

packages/input_schema/src/utilities.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { parse } from 'acorn-loose';
22
import type { ValidateFunction } from 'ajv';
33
import type Ajv from 'ajv/dist/2019';
44
import { countries } from 'countries-list';
5+
import safe from 'safe-regex';
56

67
import { PROXY_URL_REGEX, URL_REGEX } from '@apify/consts';
78
import { isEncryptedValueForFieldSchema, isEncryptedValueForFieldType } from '@apify/input_secrets';
@@ -361,3 +362,26 @@ export function ensureAjvSupportsDraft2019(ajvInstance: Ajv) {
361362
);
362363
}
363364
}
365+
366+
/**
367+
* Validates that the provided pattern is a valid and safe regular expression.
368+
* @param pattern The regular expression pattern to validate.
369+
* @param fieldKey The field key where the pattern is used (for error messages).
370+
*/
371+
export function validateRegexpPattern(pattern: string, fieldKey: string) {
372+
let regex: RegExp;
373+
374+
try {
375+
// Validate that the pattern is a valid regular expression
376+
regex = new RegExp(pattern);
377+
} catch {
378+
const message = m('inputSchema.validation.regexpNotValid', { pattern, fieldKey });
379+
throw new Error(`Input schema is not valid (${message})`);
380+
}
381+
382+
// Check if the regex is safe (to avoid ReDoS attacks)
383+
if (!safe(regex)) {
384+
const message = m('inputSchema.validation.regexpNotSafe', { pattern, fieldKey });
385+
throw new Error(`Input schema is not valid (${message})`);
386+
}
387+
}

test/input_schema.test.ts

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,5 +922,150 @@ describe('input_schema.json', () => {
922922
});
923923
});
924924
});
925+
926+
describe('validate pattern regexps', () => {
927+
it('should accept valid regexp', () => {
928+
const schema = {
929+
title: 'Test input schema',
930+
type: 'object',
931+
schemaVersion: 1,
932+
properties: {
933+
myField: {
934+
title: 'Field title',
935+
type: 'string',
936+
description: 'Some description ...',
937+
editor: 'textfield',
938+
pattern: '^[A-Z]{3}$',
939+
},
940+
objectField: {
941+
title: 'Object field',
942+
type: 'object',
943+
description: 'Some description ...',
944+
editor: 'json',
945+
patternKey: '^[a-z]+$',
946+
patternValue: '^[0-9]+$',
947+
},
948+
arrayField: {
949+
title: 'Array field',
950+
type: 'array',
951+
description: 'Some description ...',
952+
editor: 'json',
953+
patternKey: '^[a-z]+$',
954+
patternValue: '^[0-9]+$',
955+
},
956+
},
957+
};
958+
959+
expect(() => validateInputSchema(validator, schema)).not.toThrow();
960+
});
961+
962+
it('should throw error on invalid pattern regexp', () => {
963+
const schema = {
964+
title: 'Test input schema',
965+
type: 'object',
966+
schemaVersion: 1,
967+
properties: {
968+
myField: {
969+
title: 'Field title',
970+
type: 'string',
971+
description: 'Some description ...',
972+
editor: 'textfield',
973+
pattern: '[A-Z{3}', // invalid regexp
974+
},
975+
},
976+
};
977+
978+
expect(() => validateInputSchema(validator, schema)).toThrow(
979+
'Input schema is not valid (The regular expression "[A-Z{3}" in field schema.properties.myField.pattern must be valid.)',
980+
);
981+
});
982+
983+
it('should throw error on invalid patternKey regexp', () => {
984+
const schema = {
985+
title: 'Test input schema',
986+
type: 'object',
987+
schemaVersion: 1,
988+
properties: {
989+
objectField: {
990+
title: 'Object field',
991+
type: 'object',
992+
description: 'Some description ...',
993+
editor: 'json',
994+
patternKey: '[a-z+$', // invalid regexp
995+
patternValue: '^[0-9]+$',
996+
},
997+
},
998+
};
999+
1000+
expect(() => validateInputSchema(validator, schema)).toThrow(
1001+
'Input schema is not valid (The regular expression "[a-z+$" in field schema.properties.objectField.patternKey must be valid.)',
1002+
);
1003+
});
1004+
1005+
it('should throw error on invalid patternValue regexp', () => {
1006+
const schema = {
1007+
title: 'Test input schema',
1008+
type: 'object',
1009+
schemaVersion: 1,
1010+
properties: {
1011+
objectField: {
1012+
title: 'Object field',
1013+
type: 'object',
1014+
description: 'Some description ...',
1015+
editor: 'json',
1016+
patternKey: '^[a-z]+$',
1017+
patternValue: '^[0-9+$', // invalid regexp
1018+
},
1019+
},
1020+
};
1021+
1022+
expect(() => validateInputSchema(validator, schema)).toThrow(
1023+
'Input schema is not valid (The regular expression "^[0-9+$" in field schema.properties.objectField.patternValue must be valid.)',
1024+
);
1025+
});
1026+
1027+
it('should throw error on not safe regexp', () => {
1028+
const invalidRegexps = [
1029+
'(a+)+$',
1030+
'^(a|a?)+$',
1031+
'^(a|a*)+$',
1032+
'^(a|a+)+$',
1033+
'^(a?)+$',
1034+
'^(a*)+$',
1035+
'^(a+)*$',
1036+
'^(a|aa?)+$',
1037+
'^(a|aa*)+$',
1038+
'^(a|a+)*$',
1039+
'^(a|a?)*$',
1040+
'^(a|a*)*$',
1041+
'^(a?)*$',
1042+
'^(a*)*$',
1043+
'^(a+)?$',
1044+
'^(a*)?$',
1045+
'a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*q*r*s*t*u*v*w*x*y*z*',
1046+
];
1047+
1048+
for (const pattern of invalidRegexps) {
1049+
const schema = {
1050+
title: 'Test input schema',
1051+
type: 'object',
1052+
schemaVersion: 1,
1053+
properties: {
1054+
myField: {
1055+
title: 'Field title',
1056+
type: 'string',
1057+
description: 'Some description ...',
1058+
editor: 'textfield',
1059+
pattern,
1060+
},
1061+
},
1062+
};
1063+
1064+
expect(() => validateInputSchema(validator, schema)).toThrow(
1065+
`Input schema is not valid (The regular expression "${pattern}" in field schema.properties.myField.pattern may cause excessive backtracking or be unsafe to execute.)`,
1066+
);
1067+
}
1068+
});
1069+
});
9251070
});
9261071
});

0 commit comments

Comments
 (0)