Skip to content

Commit 68e1408

Browse files
authored
Merge pull request #7 from mendableai/feature-file-upload
file upload data connector
2 parents ce98665 + 0433d5f commit 68e1408

File tree

11 files changed

+298
-0
lines changed

11 files changed

+298
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ This repository contains a collection of data connectors built by [Mendable AI](
2525

2626
The following connectors are currently available:
2727
- ✅ Text
28+
- ✅ Files (.md, .txt, .pdf, .csv)
2829
- ✅ Web Scraper (single urls, sitemap)
2930
- ✅ Zendesk
3031
- ✅ GitHub (Private and Public repos)

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"@babel/preset-env": "^7.23.8",
4545
"@babel/preset-typescript": "^7.23.3",
4646
"@types/jest": "^29.5.11",
47+
"@types/pdf-parse": "^1.1.4",
4748
"@types/xml2js": "^0.4.14",
4849
"@typescript-eslint/eslint-plugin": "6.19.1",
4950
"@typescript-eslint/parser": "6.19.1",
@@ -64,6 +65,7 @@
6465
"googleapis": "^131.0.0",
6566
"node-html-parser": "^6.1.12",
6667
"octokit": "^3.1.2",
68+
"pdf-parse": "^1.1.1",
6769
"scrapingbee": "^1.7.4",
6870
"tsup": "^8.0.1",
6971
"xml2js": "^0.6.2"

pnpm-lock.yaml

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
id, column1, column2, column3
2+
1, test, 11111, test test
3+
2, test2 test2, 22222, test
4+
3, test3, 33333, test test test
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# This is a test markdown file
2+
3+
This file is used for testing purposes. Below is a list of items:
4+
5+
- Item 1
6+
- Item 2
7+
- Item 3
8+
9+
End of file.
13 KB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a test file.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<tests>
3+
<test>
4+
<id>1</id>
5+
<column1>test</column1>
6+
<column2>11111</column2>
7+
<column3>test test</column3>
8+
</test>
9+
<test>
10+
<id>2</id>
11+
<column1>test2 test2</column1>
12+
<column2>22222</column2>
13+
<column3>test</column3>
14+
</test>
15+
<test>
16+
<id>3</id>
17+
<column1>test3</column1>
18+
<column2>33333</column2>
19+
<column3>test test test</column3>
20+
</test>
21+
</tests>
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import { createDataConnector } from "../../../DataConnector";
2+
3+
describe('FileDataProvider', () => {
  // Fixture bodies shared by the local-file and URL scenarios. The local and
  // remote variants differ only by a trailing newline, added where needed.
  const csvBody =
    'id, column1, column2, column3\n1, test, 11111, test test\n2, test2 test2, 22222, test\n3, test3, 33333, test test test';
  const markdownBody =
    '# This is a test markdown file\n\nThis file is used for testing purposes. Below is a list of items:\n\n- Item 1\n- Item 2\n- Item 3\n\nEnd of file.\n';
  const xmlBody =
    '<?xml version="1.0" encoding="UTF-8"?>\n<tests>\n <test>\n <id>1</id>\n <column1>test</column1>\n <column2>11111</column2>\n <column3>test test</column3>\n </test>\n <test>\n <id>2</id>\n <column1>test2 test2</column1>\n <column2>22222</column2>\n <column3>test</column3>\n </test>\n <test>\n <id>3</id>\n <column1>test3</column1>\n <column2>33333</column2>\n <column3>test test test</column3>\n </test>\n</tests>';

  it('should return correct documents', async () => {
    const fixturesDir = './src/__tests__/providers/File/files';
    const connector = createDataConnector({ provider: 'file' });

    await connector.setOptions({
      files: ['csv', 'md', 'pdf', 'txt', 'xml'].map(
        (extension) => `${fixturesDir}/test.${extension}`
      ),
    });

    // Local files get a generated "#FILE_<digits>" placeholder as their source.
    const localDocument = (type: string, content: string) => ({
      content,
      metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/) },
      provider: 'file',
      type,
    });

    const result = await connector.getDocuments();

    expect(result).not.toBe(null);
    expect(result.length).toBe(5);
    expect(result[0].content).not.toBe(null);
    expect(result[0].content.length).toBeGreaterThan(0);
    expect(result).toEqual([
      localDocument('csv', csvBody),
      localDocument('md', markdownBody),
      localDocument('pdf', '\n\nDummy PDF file'),
      localDocument('txt', 'This is a test file.\n'),
      localDocument('xml', xmlBody + '\n'),
    ]);
  });

  it('should fetch documents from URLs', async () => {
    const bucket = 'https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing';
    const optionsURLs = {
      urls: [
        `${bucket}/test.csv`,
        `${bucket}/test.md`,
        `${bucket}/test%20%281%29.pdf`,
        `${bucket}/test.txt`,
        `${bucket}/test.xml`,
      ],
    };

    const connector = createDataConnector({ provider: 'file' });
    await connector.setOptions(optionsURLs);
    const result = await connector.getDocuments();

    // Remote documents report the URL they were fetched from as their source.
    const remoteDocument = (urlIndex: number, type: string, content: unknown) => ({
      content,
      metadata: { sourceURL: optionsURLs.urls[urlIndex] },
      provider: 'file',
      type,
    });

    expect(result).not.toBe(null);
    expect(result.length).toBe(5);

    const [first] = result;
    expect(first.content).not.toBe(null);
    expect(first.content.length).toBeGreaterThan(0);
    expect(first.metadata.sourceURL).not.toBe(null);
    expect(first.provider).toBe('file');

    expect(result).toContainEqual(remoteDocument(0, 'csv', csvBody + '\n'));
    expect(result).toContainEqual(
      remoteDocument(1, 'md', expect.stringContaining(markdownBody))
    );
    expect(result).toContainEqual(
      remoteDocument(2, 'pdf', expect.stringContaining('Dummy PDF file'))
    );
    expect(result).toContainEqual(
      remoteDocument(3, 'txt', expect.stringContaining('This is a test file.'))
    );
    expect(result).toContainEqual(
      remoteDocument(4, 'xml', expect.stringContaining(xmlBody))
    );
  });
});

src/providers/File/index.ts

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import { DataProvider } from "../DataProvider";
2+
import { Document } from "../../entities/Document";
3+
import fs from "fs";
4+
import pdf from 'pdf-parse';
5+
6+
/**
 * Options accepted by `FileDataProvider.setOptions`.
 *
 * Both fields are optional at the type level, but `setOptions` throws when
 * neither is present and also when both are present, so callers should set
 * exactly one of them.
 */
export type FileInputOptions = {
7+
/** Local file paths to read documents from. */
files?: string[];
8+
/** Remote URLs to fetch documents from. */
urls?: string[];
9+
};
10+
11+
/**
 * Data provider that turns local files or remote URLs into documents.
 *
 * Exactly one source kind is active at a time (see `setOptions`). Inputs whose
 * extension is `pdf` (case-insensitive) are converted to text with `pdf-parse`;
 * every other input is treated as UTF-8 text.
 */
export class FileDataProvider implements DataProvider<FileInputOptions> {
  private files: string[] = [];
  private urls: string[] = [];

  /** No authorization is needed for plain files or public URLs. */
  authorize(): void {
    return;
  }

  /**
   * Loads every configured source, in the order it was configured.
   *
   * @returns One document per file or URL; an empty array when nothing is configured.
   * @throws Error when a file cannot be read/parsed or a URL cannot be fetched.
   */
  async getDocuments(): Promise<Document[]> {
    if (this.files.length > 0) {
      return this.loadLocalFiles();
    }
    if (this.urls.length > 0) {
      return this.loadRemoteFiles();
    }
    return [];
  }

  /** No Nango authorization is needed for this provider. */
  async authorizeNango(): Promise<void> {
    return;
  }

  /**
   * Selects the sources to load. Setting one kind clears the other.
   *
   * @param options Either `files` or `urls`, never both.
   * @throws Error when neither or both of `files` and `urls` are provided.
   */
  setOptions(options: FileInputOptions): void {
    if (!options.files && !options.urls) {
      throw new Error("Either a file path or a URL must be provided");
    }
    if (options.files && options.urls) {
      throw new Error("Only one of file paths or URLs can be provided");
    }
    // Copy the arrays so later mutations by the caller cannot change what this
    // provider will load.
    if (options.files) {
      this.files = [...options.files];
      this.urls = [];
    }
    if (options.urls) {
      this.urls = [...options.urls];
      this.files = [];
    }
  }

  /** Reads each configured local file and wraps its text in a document. */
  private async loadLocalFiles(): Promise<Document[]> {
    const documents: Document[] = [];

    for (const file of this.files) {
      const fileType = FileDataProvider.extensionOf(file);
      let content: string;

      try {
        if (fileType === "pdf") {
          const data = await pdf(await fs.promises.readFile(file));
          content = data.text;
        } else {
          content = await fs.promises.readFile(file, { encoding: "utf8" });
        }
      } catch (error) {
        throw new Error(`Error reading file ${file}: ${error}`);
      }

      // Local files have no URL; a random placeholder id is used instead.
      // It is not guaranteed to be unique.
      const randomNumber = Math.floor(Math.random() * 100000000);
      documents.push({
        content,
        metadata: {
          sourceURL: "#FILE_" + randomNumber.toString(),
        },
        provider: "file",
        type: fileType,
      });
    }

    return documents;
  }

  /** Downloads each configured URL and wraps its text in a document. */
  private async loadRemoteFiles(): Promise<Document[]> {
    const documents: Document[] = [];

    for (const url of this.urls) {
      const response = await FileDataProvider.download(url);
      // Checked outside any try/catch so the message is not wrapped twice.
      if (!response.ok) {
        throw new Error(`Error fetching URL ${url}: ${response.statusText}`);
      }

      const fileType = FileDataProvider.urlExtensionOf(url);
      let content: string;

      try {
        if (fileType === "pdf") {
          const buffer = Buffer.from(await response.arrayBuffer());
          const data = await pdf(buffer);
          content = data.text;
        } else {
          // A trailing newline is appended to fetched text; existing
          // consumers and tests rely on it.
          content = (await response.text()) + "\n";
        }
      } catch (error) {
        throw new Error(`Error fetching URL ${url}: ${error}`);
      }

      documents.push({
        content,
        metadata: {
          sourceURL: url,
        },
        provider: "file",
        type: fileType,
      });
    }

    return documents;
  }

  /** Performs the HTTP request, adding the URL to any transport-level failure. */
  private static async download(url: string) {
    try {
      return await fetch(url);
    } catch (error) {
      throw new Error(`Error fetching URL ${url}: ${error}`);
    }
  }

  /**
   * Returns the lower-cased extension of the last path segment, without the
   * dot, or "" when that segment contains no dot. Only the last segment is
   * inspected so dots in directory names are ignored.
   */
  private static extensionOf(location: string): string {
    const lastSeparator = Math.max(
      location.lastIndexOf("/"),
      location.lastIndexOf("\\")
    );
    const baseName = location.slice(lastSeparator + 1);
    const lastDot = baseName.lastIndexOf(".");
    return lastDot === -1 ? "" : baseName.slice(lastDot + 1).toLowerCase();
  }

  /**
   * Like `extensionOf`, but ignores the query string and fragment so signed
   * or parameterised URLs (`file.pdf?sig=a.b`) still resolve to `pdf`.
   */
  private static urlExtensionOf(url: string): string {
    let pathname: string;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // Not an absolute URL: strip the query/fragment by hand.
      pathname = url.split(/[?#]/, 1)[0] ?? url;
    }
    return FileDataProvider.extensionOf(pathname);
  }
}

0 commit comments

Comments
 (0)