diff --git a/packages/gatsby-transformer-screenshot/.gitignore b/packages/gatsby-transformer-screenshot/.gitignore new file mode 100644 index 0000000000000..f44b511af7ac1 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/.gitignore @@ -0,0 +1,5 @@ +/*.js +!index.js +yarn.lock +lambda-package.zip +lambda-dist diff --git a/packages/gatsby-transformer-screenshot/.npmignore b/packages/gatsby-transformer-screenshot/.npmignore new file mode 100644 index 0000000000000..039ffba2cef82 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/.npmignore @@ -0,0 +1,40 @@ +# Logs +logs +*.log + +# Runtime data +pids +*.pid +*.seed + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage + +# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (http://nodejs.org/api/addons.html) +build/Release + +# Dependency directory +# https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git +node_modules +*.un~ +yarn.lock +src +flow-typed +coverage +decls +examples + +# Lambda-related +lambda +lambda-dist +chrome +lambda-package.zip diff --git a/packages/gatsby-transformer-screenshot/README.md b/packages/gatsby-transformer-screenshot/README.md new file mode 100644 index 0000000000000..840abb1652988 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/README.md @@ -0,0 +1,67 @@ +# gatsby-transformer-screenshot + +Plugin for creating screenshots of website URLs using an AWS Lambda +Function. This plugin looks for `SitesYaml` nodes with a `url` +property, and creates `Screenshot` child nodes with a `screenshotFile` field. 
+ +[Live demo](https://thatotherperson.github.io/gatsby-screenshot-demo/) +([source](https://github.com/ThatOtherPerson/gatsby-screenshot-demo)) + +Data should be in a yaml file named `sites.yml` and look like: + +```yaml +- url: https://reactjs.org/ + name: React +- url: https://about.sourcegraph.com/ + name: Sourcegraph +- url: https://simply.co.za/ + name: Simply +``` + +## Install + +`npm install gatsby-transformer-screenshot` + +## How to use + +```javascript +// in your gatsby-config.js +module.exports = { + plugins: [`gatsby-transformer-screenshot`], +}; +``` + +## How to query + +You can query for screenshot files as shown below: + +```graphql +{ + allSitesYaml { + edges { + node { + url + childScreenshot { + screenshotFile { + id + } + } + } + } + } +} +``` + +screenshotFile is a PNG file like any other loaded from your filesystem, so you can use this plugin in combination with `gatsby-image`. + +## Lambda setup + +Gatsby provides a hosted screenshot service for you to use; however, you can run the service yourself on AWS Lambda. + +AWS Lambda is a "serverless" computing platform that lets you run code in response to events, without needing to set up a server. This plugin uses a Lambda function to take screenshots and store them in an AWS S3 bucket. + +First, you will need to [create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) for storing screenshots. Once you have done that, create a [Lifecycle Policy](https://docs.aws.amazon.com/AmazonS3/latest/user-guide/create-lifecycle.html) for the bucket that sets a number of days before files in the bucket expire. Screenshots will be cached until this date. + +To build the Lambda package, run `npm run build-lambda-package` in this directory. A file called `lambda-package.zip` will be generated - upload this as the source of your AWS Lambda. Finally, you will need to set `S3_BUCKET` as an environment variable for the lambda. 
+ +To set up the HTTP interface, you will need to use AWS API Gateway. Create a new API, create a new resource under `/`, select "Configure as proxy resource", and leave all the settings with their defaults. Create a method on the new resource, selecting "Lambda Function Proxy" as the integration type, and fill in the details of your lambda. diff --git a/packages/gatsby-transformer-screenshot/chrome/buildChrome.sh b/packages/gatsby-transformer-screenshot/chrome/buildChrome.sh new file mode 100644 index 0000000000000..e637565f712d5 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/chrome/buildChrome.sh @@ -0,0 +1,35 @@ +# build headless chrome on EC2 +# https://github.com/adieuadieu/serverless-chrome/blob/master/chrome/README.md + +# sudo su + +yum install -y git redhat-lsb python bzip2 tar pkgconfig atk-devel alsa-lib-devel bison binutils brlapi-devel bluez-libs-devel bzip2-devel cairo-devel cups-devel dbus-devel dbus-glib-devel expat-devel fontconfig-devel freetype-devel gcc-c++ GConf2-devel glib2-devel glibc.i686 gperf glib2-devel gtk2-devel gtk3-devel java-1.*.0-openjdk-devel libatomic libcap-devel libffi-devel libgcc.i686 libgnome-keyring-devel libjpeg-devel libstdc++.i686 libX11-devel libXScrnSaver-devel libXtst-devel libxkbcommon-x11-devel ncurses-compat-libs nspr-devel nss-devel pam-devel pango-devel pciutils-devel pulseaudio-libs-devel zlib.i686 httpd mod_ssl php php-cli python-psutil wdiff --enablerepo=epel + +cd ~ +git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git +echo "export PATH=$PATH:$HOME/depot_tools" >> ~/.bash_profile +source ~/.bash_profile + +mkdir Chromium +cd Chromium +fetch --no-history chromium +cd src + +# use /tmp instead of /dev/shm +# https://groups.google.com/a/chromium.org/forum/#!msg/headless-dev/qqbZVZ2IwEw/CPInd55OBgAJ +sed -i -e "s/use_dev_shm = true;/use_dev_shm = false;/g" base/files/file_util_posix.cc + +mkdir -p out/Headless +echo 'import("//build/args/headless.gn")' > out/Headless/args.gn 
+echo 'is_debug = false' >> out/Headless/args.gn +echo 'symbol_level = 0' >> out/Headless/args.gn +echo 'is_component_build = false' >> out/Headless/args.gn +echo 'remove_webcore_debug_symbols = true' >> out/Headless/args.gn +echo 'enable_nacl = false' >> out/Headless/args.gn +gn gen out/Headless +ninja -C out/Headless headless_shell + +cd out/Headless +tar -zcvf /home/ec2-user/headless_shell.tar.gz headless_shell + +# scp ec2-user@xxx.amazonaws.com:~/headless_shell.tar.gz . diff --git a/packages/gatsby-transformer-screenshot/chrome/headless_shell.tar.gz b/packages/gatsby-transformer-screenshot/chrome/headless_shell.tar.gz new file mode 100644 index 0000000000000..d44f014d1bf0f Binary files /dev/null and b/packages/gatsby-transformer-screenshot/chrome/headless_shell.tar.gz differ diff --git a/packages/gatsby-transformer-screenshot/index.js b/packages/gatsby-transformer-screenshot/index.js new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/packages/gatsby-transformer-screenshot/lambda/.babelrc b/packages/gatsby-transformer-screenshot/lambda/.babelrc new file mode 100644 index 0000000000000..77bea5b86ef9a --- /dev/null +++ b/packages/gatsby-transformer-screenshot/lambda/.babelrc @@ -0,0 +1,10 @@ +{ + "presets": [ + ["env", + { + "targets": { + "node": "6.10" + } + }] + ] +} diff --git a/packages/gatsby-transformer-screenshot/lambda/index.js b/packages/gatsby-transformer-screenshot/lambda/index.js new file mode 100644 index 0000000000000..2f100dcce955a --- /dev/null +++ b/packages/gatsby-transformer-screenshot/lambda/index.js @@ -0,0 +1,174 @@ +const setup = require(`./starter-kit/setup`) + +const crypto = require(`crypto`) + +const AWS = require(`aws-sdk`) +const s3 = new AWS.S3({ + apiVersion: `2006-03-01`, +}) + +exports.handler = async (event, context, callback) => { + // For keeping the browser launch + context.callbackWaitsForEmptyEventLoop = false + + let request = {} + if (event.body) { + request = JSON.parse(event.body) + } + + const url = 
request.url + + if (!url) { + callback(null, proxyError(`no url provided`)) + return + } + + const width = request.width || 1024 + const height = request.height || 768 + + const browser = await setup.getBrowser() + exports + .run(browser, url, width, height) + .then(result => { + callback(null, proxyResponse(result)) + }) + .catch(err => { + callback(null, proxyError(err)) + }) +} + +exports.run = async (browser, url, width, height) => { + console.log(`Invoked: ${url} (${width}x${height})`) + + if (!process.env.S3_BUCKET) { + throw new Error( + `Provide the S3 bucket to use by adding an S3_BUCKET` + + ` environment variable to this Lambda's configuration` + ) + } + + const region = await s3GetBucketLocation(process.env.S3_BUCKET) + + if (!region) { + throw new Error(`invalid bucket ${process.env.S3_BUCKET}`) + } + + const keyBase = `${url}-(${width},${height})` + const digest = crypto + .createHash(`md5`) + .update(keyBase) + .digest(`hex`) + const key = `${digest}.png` + + const screenshotUrl = `https://s3-${region}.amazonaws.com/${ + process.env.S3_BUCKET + }/${key}` + + const metadata = await s3HeadObject(key) + + const now = new Date() + if (metadata) { + if (metadata.Expiration) { + const expires = getDateFromExpiration(metadata.Expiration) + if (now < expires) { + console.log(`Returning cached screenshot`) + return { url: screenshotUrl, expires } + } + } else { + throw new Error(`no expiration date set`) + } + } + + console.log(`Taking new screenshot`) + + const page = await browser.newPage() + + await page.setViewport({ width, height }) + await page.goto(url, { waitUntil: [`load`, `networkidle0`] }) + + const screenshot = await page.screenshot() + const up = await s3PutObject(key, screenshot) + + await page.close() + + let expires + + if (up && up.Expiration) { + expires = getDateFromExpiration(up.Expiration) + } + + return { url: screenshotUrl, expires } +} + +const proxyResponse = body => { + body.success = true + + return { + statusCode: 200, + body: 
/**
 * Build an API Gateway proxy response describing a failure.
 *
 * @param {Error|string} err - An Error (its `message` is reported) or a
 *   ready-made message.
 * @returns {{statusCode: number, body: string}} A 400 response whose JSON
 *   body is `{ success: false, error }`.
 */
const proxyError = err => {
  const msg = err instanceof Error ? err.message : err

  return {
    statusCode: 400,
    body: JSON.stringify({
      success: false,
      error: msg,
    }),
  }
}

/**
 * Upload a PNG to the `S3_BUCKET` bucket with a public-read ACL.
 *
 * @param {string} key - Object key to write.
 * @param {Buffer} body - PNG data.
 * @returns {Promise<Object>} The `putObject` response data; rejects with the
 *   S3 error when the upload fails.
 */
const s3PutObject = (key, body) => {
  const params = {
    ACL: `public-read`,
    Bucket: process.env.S3_BUCKET,
    Key: key,
    Body: body,
    ContentType: `image/png`,
  }

  return new Promise((resolve, reject) => {
    s3.putObject(params, (err, data) => {
      if (err) reject(err)
      else resolve(data)
    })
  })
}

/**
 * Look up the location constraint of a bucket.
 *
 * @param {string} bucket - Bucket name.
 * @returns {Promise<?string>} The bucket's `LocationConstraint` exactly as
 *   S3 reports it, or `null` when the lookup fails. Never rejects.
 */
const s3GetBucketLocation = bucket => {
  const params = {
    Bucket: bucket,
  }

  return new Promise(resolve => {
    s3.getBucketLocation(params, (err, data) => {
      if (err) {
        // Still best-effort, but leave a trace so a misconfigured bucket or
        // missing permission can be diagnosed from the logs.
        console.log(
          `getBucketLocation failed for ${bucket}: ${err.code || err.message}`
        )
        resolve(null)
      } else {
        resolve(data.LocationConstraint)
      }
    })
  })
}

/**
 * Fetch the metadata of an object in the `S3_BUCKET` bucket.
 *
 * @param {string} key - Object key.
 * @returns {Promise<?Object>} The `headObject` response data, or `null`
 *   when the request fails (including when the object does not exist).
 *   Never rejects.
 */
const s3HeadObject = key => {
  const params = {
    Bucket: process.env.S3_BUCKET,
    Key: key,
  }

  return new Promise(resolve => {
    s3.headObject(params, (err, data) => {
      if (err) {
        // A missing object is the normal cache-miss case; anything else is
        // logged. NOTE(review): assumes the SDK marks a missing key with
        // statusCode 404 / code `NotFound` - confirm against the SDK docs.
        const isMissing = err.statusCode === 404 || err.code === `NotFound`
        if (!isMissing) {
          console.log(
            `headObject failed for ${key}: ${err.code || err.message}`
          )
        }
        resolve(null)
      } else {
        resolve(data)
      }
    })
  })
}

const expiryPattern = /expiry-date="([^"]*)"/

/**
 * Extract the expiry date from an S3 `Expiration` value, i.e. the text
 * inside `expiry-date="..."`.
 *
 * @param {string} expiration - `Expiration` value from an S3 response.
 * @returns {Date} The parsed expiry date.
 * @throws {Error} When the value contains no `expiry-date="..."` part.
 */
const getDateFromExpiration = expiration => {
  const match = expiryPattern.exec(expiration)

  if (!match) {
    throw new Error(
      `no expiry-date found in expiration header: ${expiration}`
    )
  }

  return new Date(match[1])
}
granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/packages/gatsby-transformer-screenshot/lambda/starter-kit/README.md b/packages/gatsby-transformer-screenshot/lambda/starter-kit/README.md new file mode 100644 index 0000000000000..c9b2eab3943da --- /dev/null +++ b/packages/gatsby-transformer-screenshot/lambda/starter-kit/README.md @@ -0,0 +1 @@ +See [puppeteer-lambda-starter-kit](https://github.com/sambaiz/puppeteer-lambda-starter-kit) for the original, unmodified code used as the base for this Lambda. diff --git a/packages/gatsby-transformer-screenshot/lambda/starter-kit/config.js b/packages/gatsby-transformer-screenshot/lambda/starter-kit/config.js new file mode 100644 index 0000000000000..00e14b83dd420 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/lambda/starter-kit/config.js @@ -0,0 +1,29 @@ +const path = require(`path`) + +const launchOptionForLambda = [ + // error when launch(); No usable sandbox! 
Update your kernel + `--no-sandbox`, + // error when launch(); Failed to load libosmesa.so + `--disable-gpu`, + // freeze when newPage() + `--single-process`, +] + +const localChromePath = path.join(`headless_shell.tar.gz`) +const remoteChromeS3Bucket = process.env.CHROME_BUCKET +const remoteChromeS3Key = process.env.CHROME_KEY || `headless_shell.tar.gz` + +const setupChromePath = path.join(path.sep, `tmp`) +const executablePath = path.join(setupChromePath, `headless_shell`) + +const DEBUG = process.env.DEBUG + +module.exports = { + launchOptionForLambda, + localChromePath, + remoteChromeS3Bucket, + remoteChromeS3Key, + setupChromePath, + executablePath, + DEBUG, +} diff --git a/packages/gatsby-transformer-screenshot/lambda/starter-kit/setup.js b/packages/gatsby-transformer-screenshot/lambda/starter-kit/setup.js new file mode 100644 index 0000000000000..8c94806f845dc --- /dev/null +++ b/packages/gatsby-transformer-screenshot/lambda/starter-kit/setup.js @@ -0,0 +1,103 @@ +const aws = require(`aws-sdk`) +const s3 = new aws.S3({ apiVersion: `2006-03-01` }) +const fs = require(`fs`) +const tar = require(`tar`) +const puppeteer = require(`puppeteer`) +const config = require(`./config`) + +exports.getBrowser = (() => { + let browser + return async () => { + if (typeof browser === `undefined` || !await isBrowserAvailable(browser)) { + await setupChrome() + browser = await puppeteer.launch({ + headless: true, + executablePath: config.executablePath, + args: config.launchOptionForLambda, + dumpio: !!exports.DEBUG, + ignoreHTTPSErrors: true, + }) + const version = await browser.version() + debugLog(async b => `launch done: ${version}`) + } + return browser + } +})() + +const isBrowserAvailable = async browser => { + try { + await browser.version() + } catch (e) { + debugLog(e) // not opened etc. 
+ return false + } + return true +} + +const setupChrome = async () => { + if (!await existsExecutableChrome()) { + if (await existsLocalChrome()) { + debugLog(`setup local chrome`) + await setupLocalChrome() + } else { + debugLog(`setup s3 chrome`) + await setupS3Chrome() + } + debugLog(`setup done`) + } +} + +const existsLocalChrome = () => + new Promise((resolve, reject) => { + fs.exists(config.localChromePath, exists => { + resolve(exists) + }) + }) + +const existsExecutableChrome = () => + new Promise((resolve, reject) => { + fs.exists(config.executablePath, exists => { + resolve(exists) + }) + }) + +const setupLocalChrome = () => + new Promise((resolve, reject) => { + fs + .createReadStream(config.localChromePath) + .on(`error`, err => reject(err)) + .pipe( + tar.x({ + C: config.setupChromePath, + }) + ) + .on(`error`, err => reject(err)) + .on(`end`, () => resolve()) + }) + +const setupS3Chrome = () => + new Promise((resolve, reject) => { + const params = { + Bucket: config.remoteChromeS3Bucket, + Key: config.remoteChromeS3Key, + } + s3 + .getObject(params) + .createReadStream() + .on(`error`, err => reject(err)) + .pipe( + tar.x({ + C: config.setupChromePath, + }) + ) + .on(`error`, err => reject(err)) + .on(`end`, () => resolve()) + }) + +const debugLog = log => { + if (config.DEBUG) { + let message = log + if (typeof log === `function`) message = log() + Promise.resolve(message).then(message => console.log(message)) + } +} diff --git a/packages/gatsby-transformer-screenshot/package.json b/packages/gatsby-transformer-screenshot/package.json new file mode 100644 index 0000000000000..db40b8375f9c7 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/package.json @@ -0,0 +1,27 @@ +{ + "name": "gatsby-transformer-screenshot", + "version": "1.0.0", + "description": "Gatsby transformer plugin that uses AWS Lambda to take screenshots of websites", + "main": "index.js", + "dependencies": { + "axios": "^0.17.1" + }, + "devDependencies": { + "babel-cli": 
"^6.26.0", + "cross-env": "^5.1.3" + }, + "scripts": { + "build": "babel src --out-dir . --ignore __tests__", + "watch": "babel -w src --out-dir . --ignore __tests__", + "prepublish": "cross-env NODE_ENV=production npm run build", + "build-lambda-package": "npm run prepare-lambda-package && cp chrome/headless_shell.tar.gz lambda-dist && cd lambda-dist && zip -rq ../lambda-package.zip .", + "prepare-lambda-package": "babel lambda --out-dir lambda-dist && cp lambda/package.json lambda-dist/package.json && cd lambda-dist && PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 npm install --production" + }, + "keywords": [ + "gatsby", + "gatsby-plugin", + "screenshot" + ], + "author": "David Beckley ", + "license": "MIT" +} diff --git a/packages/gatsby-transformer-screenshot/src/gatsby-node.js b/packages/gatsby-transformer-screenshot/src/gatsby-node.js new file mode 100644 index 0000000000000..8a2b9b77015d1 --- /dev/null +++ b/packages/gatsby-transformer-screenshot/src/gatsby-node.js @@ -0,0 +1,99 @@ +const crypto = require(`crypto`) +const axios = require(`axios`) +const _ = require(`lodash`) +const { createRemoteFileNode } = require(`gatsby-source-filesystem`) + +const SCREENSHOT_ENDPOINT = `https://h7iqvn4842.execute-api.us-east-2.amazonaws.com/prod/screenshot` + +const createContentDigest = obj => + crypto + .createHash(`md5`) + .update(JSON.stringify(obj)) + .digest(`hex`) + +exports.onPreBootstrap = ( + { store, cache, boundActionCreators }, + pluginOptions +) => { + const { createNode, touchNode } = boundActionCreators + + // Check for updated screenshots + // and prevent Gatsby from garbage collecting remote file nodes + return Promise.all( + _.values(store.getState().nodes) + .filter(n => n.internal.type === `Screenshot`) + .map(async n => { + if (n.expires && new Date() >= new Date(n.expires)) { + // Screenshot expired, re-run Lambda + await createScreenshotNode({ + url: n.url, + parent: n.parent, + store, + cache, + createNode, + }) + } else { + // Screenshot hasn't yet 
expired, touch the image node + // to prevent garbage collection + touchNode(n.screenshotFile___NODE) + } + }) + ) +} + +exports.onCreateNode = async ({ node, boundActionCreators, store, cache }) => { + const { createNode, createParentChildLink } = boundActionCreators + + // We only care about parsed sites.yaml files + if (node.internal.type !== `SitesYaml`) { + return + } + + const screenshotNode = await createScreenshotNode({ + url: node.url, + parent: node.id, + store, + cache, + createNode, + }) + + createParentChildLink({ + parent: node, + child: screenshotNode, + }) +} + +const createScreenshotNode = async ({ + url, + parent, + store, + cache, + createNode, +}) => { + const screenshotResponse = await axios.post(SCREENSHOT_ENDPOINT, { url }) + + const fileNode = await createRemoteFileNode({ + url: screenshotResponse.data.url, + store, + cache, + createNode, + }) + + const screenshotNode = { + id: `${parent} >>> Screenshot`, + url, + expires: screenshotResponse.data.expires, + parent, + children: [], + internal: { + type: `Screenshot`, + }, + screenshotFile___NODE: fileNode.id, + } + + screenshotNode.internal.contentDigest = createContentDigest(screenshotNode) + + createNode(screenshotNode) + + return screenshotNode +}