Skip to content

Commit d8e1732

Browse files
tsimonsKyleAMathews
authored and committed
Queue requests from createRemoteFileNode and control concurrency of requests (gatsbyjs#4616)
* Chunk the requests to download media objects from WP. The blog I work on has over 9,000 media objects and currently, it tries to download them all. This PR chunks them in groups of 100, but that setting can be increased. * Remove prettier formatting from the readme * Clean up and document create-remote-file-node Add Better Queue for more control over processing * Rollback changes to wp source files * Add queue for requesting wp objects update readme with new config option * Revert files to master * No longer throw an exception when an error occurs. Just resolve with null and move on * Remove file lock lookup for now. 200 concurrent requests is a safe number and we can look to change this in the future * Cosmetic updates * Remove console.log
1 parent aa03b9f commit d8e1732

File tree

1 file changed

+250
-87
lines changed

1 file changed

+250
-87
lines changed

src/create-remote-file-node.js

Lines changed: 250 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -3,112 +3,275 @@ const got = require(`got`)
33
const crypto = require(`crypto`)
44
const path = require(`path`)
55
const { isWebUri } = require(`valid-url`)
6+
const Queue = require(`better-queue`)
67

78
const { createFileNode } = require(`./create-file-node`)
89
const cacheId = url => `create-remote-file-node-${url}`
910

11+
/********************
12+
* Type Definitions *
13+
********************/
14+
1015
/**
11-
* Index of promises resolving to File node from remote url
16+
* @typedef {Redux}
17+
* @see [Redux Docs]{@link https://redux.js.org/api-reference}
1218
*/
13-
const processingCache = {}
1419

15-
module.exports = ({ url, store, cache, createNode, auth = {} }) => {
16-
// Check if we already requested node for this remote file
17-
// and return stored promise if we did.
18-
if (processingCache[url]) {
19-
return processingCache[url]
20+
/**
21+
* @typedef {GatsbyCache}
22+
* @see gatsby/packages/gatsby/utils/cache.js
23+
*/
24+
25+
/**
26+
* @typedef {Auth}
27+
* @type {Object}
28+
* @property {String} htaccess_pass
29+
* @property {String} htaccess_user
30+
*/
31+
32+
/**
33+
* @typedef {CreateRemoteFileNodePayload}
34+
* @typedef {Object}
35+
* @description Create Remote File Node Payload
36+
*
37+
* @param {String} options.url
38+
* @param {Redux} options.store
39+
* @param {GatsbyCache} options.cache
40+
* @param {Function} options.createNode
41+
* @param {Auth} [options.auth]
42+
*/
43+
44+
/*********
45+
* utils *
46+
*********/
47+
48+
/**
 * createHash
 * --
 *
 * Compute the md5 digest of the given string, hex-encoded.
 * Used to derive stable, filesystem-safe cache filenames from urls.
 *
 * @param {String} str
 * @return {String} 32-character lowercase hex digest
 */
function createHash (str) {
  return crypto.createHash(`md5`).update(str).digest(`hex`)
}
60+
61+
const CACHE_DIR = `.cache`
const FS_PLUGIN_DIR = `gatsby-source-filesystem`

/**
 * createFilePath
 * --
 *
 * Build the on-disk cache path for a downloaded file:
 * `<directory>/.cache/gatsby-source-filesystem/<filename><ext>`
 *
 * @param {String} directory - the Gatsby program directory
 * @param {String} filename - base file name, without extension
 * @param {String} ext - extension including the leading dot (may be empty)
 * @return {String}
 */
const createFilePath = (directory, filename, ext) => path.join(
  directory,
  CACHE_DIR,
  FS_PLUGIN_DIR,
  `${filename}${ext}`
)
79+
80+
/********************
 * Queue Management *
 ********************/

/**
 * Queue
 * Use the task's url as the id
 * When pushing a task with a similar id, prefer the original task
 * as it's already in the processing cache
 */
const queue = new Queue(pushToQueue, {
  // Deduplicate tasks by their `url` property.
  id: `url`,
  // When the same url is pushed again, keep the task that is already
  // queued and discard the newcomer.
  merge: (old, _, cb) => cb(old),
  // At most 200 downloads in flight at once.
  concurrent: 200,
})
95+
96+
/**
 * @callback {Queue~queueCallback}
 * @param {*} error
 * @param {*} result
 */

/**
 * pushToQueue
 * --
 * Handle tasks that are pushed in to the Queue
 *
 * Failures are deliberately not reported as task errors: the task is
 * completed with a `null` result so a single bad download doesn't abort
 * the whole sourcing run.
 *
 * @param {CreateRemoteFileNodePayload} task
 * @param {Queue~queueCallback} cb
 * @return {Promise<null>}
 */
async function pushToQueue (task, cb) {
  try {
    const node = await processRemoteNode(task)
    return cb(null, node)
  } catch (e) {
    // Resolve with null — not the error object — so consumers awaiting
    // the task never receive an Error where a File node is expected.
    return cb(null, null)
  }
}
21120

22-
return (processingCache[url] = new Promise(async (resolve, reject) => {
23-
if (!url || isWebUri(url) === undefined) {
24-
resolve()
25-
return
26-
}
121+
/******************
 * Core Functions *
 ******************/

/**
 * requestRemoteNode
 * --
 * Download the requested file
 *
 * @param {String} url
 * @param {Headers} headers - `got` request options (auth, If-None-Match, …)
 * @param {String} tmpFilename - temp path the body is streamed to
 * @param {String} filename - final destination path (unused here; the
 *   caller moves the temp file into place after inspecting the status)
 * @return {Promise<Object>} Resolves with the [http Result Object]{@link https://nodejs.org/api/http.html#http_class_http_serverresponse}
 */
const requestRemoteNode = (url, headers, tmpFilename, filename) => new Promise((resolve, reject) => {
  // Time out after 30s so one dead url can't stall a queue slot forever.
  const responseStream = got.stream(url, { ...headers, timeout: 30000 })
  responseStream.pipe(fs.createWriteStream(tmpFilename))

  // If there's a 400/500 response or other error, clean up the partial
  // download before rejecting.
  responseStream.on(`error`, (error, body, response) => {
    fs.removeSync(tmpFilename)
    reject({ error, body, response })
  })

  // Resolve as soon as headers arrive; the pipe above keeps draining the
  // body to disk in the background.
  responseStream.on(`response`, response => {
    resolve(response)
  })
})
151+
152+
/**
 * processRemoteNode
 * --
 * Request the remote file and return the fileNode
 *
 * @param {CreateRemoteFileNodePayload} options
 * @return {Promise<Object>} Resolves with the fileNode, or null on failure
 */
async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) {
  // Ensure our cache directory exists.
  const programDir = store.getState().program.directory
  await fs.ensureDir(path.join(programDir, CACHE_DIR, FS_PLUGIN_DIR))

  // See if there's response headers for this url
  // from a previous request.
  const cachedHeaders = await cache.get(cacheId(url))
  const headers = {}

  // Add htaccess authentication if passed in. This isn't particularly
  // extensible. We should define a proper API that we validate.
  if (auth && auth.htaccess_pass && auth.htaccess_user) {
    headers.auth = `${auth.htaccess_user}:${auth.htaccess_pass}`
  }

  // Ask the server to short-circuit with a 304 if the file is unchanged.
  if (cachedHeaders && cachedHeaders.etag) {
    headers[`If-None-Match`] = cachedHeaders.etag
  }

  // Create the temp and permanent file names for the url.
  const digest = createHash(url)
  const ext = path.parse(url).ext

  const tmpFilename = createFilePath(programDir, `tmp-${digest}`, ext)
  const filename = createFilePath(programDir, digest, ext)

  // Fetch the file.
  try {
    const response = await requestRemoteNode(url, headers, tmpFilename, filename)

    // Save the response headers for future requests.
    // Awaited so the etag is persisted before we report success.
    await cache.set(cacheId(url), response.headers)

    if (response.statusCode === 200) {
      // If the status code is 200, move the piped temp file to the real name.
      await fs.move(tmpFilename, filename, { overwrite: true })
    } else {
      // Else (e.g. 304 Not Modified), remove the empty response.
      await fs.remove(tmpFilename)
    }

    // Create the file node.
    const fileNode = await createFileNode(filename, {})

    // Override the default plugin as gatsby-source-filesystem needs to
    // be the owner of File nodes or there'll be conflicts if any other
    // File nodes are created through normal usages of
    // gatsby-source-filesystem.
    createNode(fileNode, { name: `gatsby-source-filesystem` })

    return fileNode
  } catch (err) {
    // Don't swallow the failure silently — surface it, then resolve null
    // so one bad download doesn't fail the whole sourcing run.
    console.error(`gatsby-source-filesystem: failed to process ${url}`, err)
  }
  return null
}
222+
223+
/**
 * Index of promises resolving to File node from remote url
 */
const processingCache = {}

/**
 * pushTask
 * --
 * Push a task into the Queue and promisify its completion.
 *
 * @param {CreateRemoteFileNodePayload} task
 * @return {Promise<Object>} resolves with the queue result, or undefined
 *   when the task fails (failures never reject)
 */
function pushTask (task) {
  return new Promise((resolve, reject) => {
    const ticket = queue.push(task)
    ticket.on(`finish`, result => resolve(result))
    // Failed tasks resolve with undefined instead of rejecting, so one
    // broken download never rejects the promise handed to plugin users.
    ticket.on(`failed`, () => resolve())
  })
}
246+
247+
/***************
248+
* Entry Point *
249+
***************/
250+
251+
/**
252+
* createRemoteFileNode
253+
* --
254+
*
255+
* Download a remote file
256+
* First checks cache to ensure duplicate requests aren't processed
257+
* Then pushes to a queue
258+
*
259+
* @param {CreateRemoteFileNodePayload} options
260+
* @return {Promise<Object>} Returns the created node
261+
*/
262+
module.exports = ({ url, store, cache, createNode, auth = {} }) => {
263+
// Check if we already requested node for this remote file
264+
// and return stored promise if we did.
265+
if (processingCache[url]) {
266+
return processingCache[url]
267+
}
268+
269+
270+
if (!url || isWebUri(url) === undefined) {
271+
// should we resolve here, or reject?
272+
// Technically, it's invalid input
273+
return Promise.resolve()
274+
}
275+
276+
return (processingCache[url] = pushTask({ url, store, cache, createNode, auth }))
114277
}

0 commit comments

Comments
 (0)