Skip to content
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,11 @@
"fetch-retry": "6.0.0",
"file-type": "21.1.1",
"hast-util-from-html": "2.0.3",
"hast-util-to-html": "9.0.5",
"jose": "6.1.2",
"lodash.escape": "4.0.1",
"private-ip": "3.0.2",
"unist-util-visit": "5.0.0",
"xml2js": "0.6.2"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion src/source/post.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ export async function postSource(context, info) {

try {
const mime = contentTypeFromExtension(info.ext);
const body = await getValidPayload(context, info, mime);
const body = await getValidPayload(context, info, mime, true);

// TODO store images HTML from the outside in the media bus

Expand Down
169 changes: 155 additions & 14 deletions src/source/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
* governing permissions and limitations under the License.
*/

import { MediaHandler } from '@adobe/helix-mediahandler';
import processQueue from '@adobe/helix-shared-process-queue';
import { fromHtml } from 'hast-util-from-html';
import { toHtml } from 'hast-util-to-html';
import { visit, CONTINUE } from 'unist-util-visit';
import { MEDIA_TYPES } from '../media/validate.js';
import { StatusCodeError } from '../support/StatusCodeError.js';

Expand All @@ -37,6 +41,16 @@ export const CONTENT_TYPES = {
'.svg': 'image/svg+xml',
};

/**
 * Largest image, in bytes, accepted when interning images: 20 MiB.
 * Passed as the `maxSize` option of the media handler.
 */
const DEFAULT_MAX_IMAGE_SIZE = 20 * 1024 ** 2;

/**
 * Largest number of distinct image URLs that a single HTML document may ask
 * to have interned; documents above this limit are rejected.
 */
const DEFAULT_MAX_IMAGES = 200;

/**
* Error messages from the media validation often start with this prefix.
*/
Expand All @@ -59,6 +73,27 @@ export function contentTypeFromExtension(ext) {
throw e;
}

/**
 * Parse the message body into a HAST tree, failing on HTML parse errors.
 *
 * Parse errors whose rule id is listed in ACCEPTABLE_HTML_ERRORS are ignored;
 * any other parse error is turned into a StatusCodeError.
 *
 * @param {Buffer} body the message body as buffer
 * @return {Hast} the HAST
 * @throws {StatusCodeError} with statusCode 400 if the HTML is invalid
 */
function getHast(body) {
  const onerror = (parseError) => {
    const description = `${parseError.message} - ${parseError.note}`;
    if (!ACCEPTABLE_HTML_ERRORS.includes(parseError.ruleId)) {
      throw new StatusCodeError(description, 400);
    }
  };

  const html = body.toString();
  return fromHtml(html, { onerror });
}

/**
* Get the S3 key from the organization, site, and path.
*
Expand All @@ -83,26 +118,94 @@ export function getSourceKey(info) {
}

/**
* Validate the HTML message body stored in the request info.
* Validate the HTML message body and intern the images if a media handler is provided.
* When interning the images, they are uploaded to the media bus and references to them
* are replaced with media bus URLs.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {Buffer} body the message body as buffer
* @param {Array<string|RegExp>} keptImageURLPrefixes image URLs to keep, given as string prefixes or as regexes tested against the URL
* @param {MediaHandler} mediaHandler media handler. If provided, external images are
* interned. If not provided, the HTML is not considered valid if it contains external
* images.
* @returns {Promise<Buffer|string>} the unchanged message body buffer when no media handler
* is provided, otherwise the body element serialized as a string with links to the interned images.
* @throws {StatusCodeError} with statusCode 400 if the HTML is invalid, does not contain
* a body element, or contains external images and a media handler is not provided. Also
* if the HTML contains too many images an error is thrown.
*/
export async function validateHtml(context, body) {
function validateHtmlError(message) {
const msg = `${message.message} - ${message.note}`;
if (ACCEPTABLE_HTML_ERRORS.includes(message.ruleId)) {
context.log.warn(`Ignoring HTML error: ${msg}`);
export async function getValidHtml(context, body, keptImageURLPrefixes, mediaHandler) {
// TODO Check HTML size limit

/* The register() function populates the images map with the nodes that need to be
interned. It is called for each img and picture->source element in the HTML. */
const images = new Map();
function register(node, propName) {
const url = node.properties[propName] || '';
const keepImageURL = keptImageURLPrefixes.some((prefix) => {
if (typeof prefix === 'string') {
return url.startsWith(prefix);
}
// it's a regex
return prefix.test(url);
});

if (keepImageURL) {
return;
}
throw new StatusCodeError(msg, 400);

if (images.has(url)) {
images.get(url).push({ node, propName });
} else {
images.set(url, [{ node, propName }]);
}
}

// TODO Check HTML size limit
let bodyNode = null;
const hast = getHast(body);
visit(hast, 'element', (node) => {
if (node.tagName === 'body') {
bodyNode = node;
}

fromHtml(body.toString(), {
onerror: validateHtmlError,
if (node.tagName === 'img') {
register(node, 'src');
}
if (node.tagName === 'picture') {
const sources = node.children.filter((child) => child.tagName === 'source');
sources.forEach((s) => register(s, 'srcSet')); // note Hast converts srcset to srcSet
}
return CONTINUE;
});

if (!mediaHandler) {
// If the media handler is not provided, we validate only and need to reject external images
if (images.size > 0) {
throw new StatusCodeError('External images are not allowed, use POST to intern them', 400);
}
return body;
}

if (images.size > DEFAULT_MAX_IMAGES) {
throw new StatusCodeError(`Too many images: ${images.size}`, 400);
}

await processQueue(images.entries(), async ([url, nodes]) => {
try {
const blob = await mediaHandler.getBlob(url);
nodes.forEach((n) => {
// eslint-disable-next-line no-param-reassign
n.node.properties[n.propName] = blob.uri || 'about:error';
});
} catch (e) {
context.log.error(`Error getting blob for image: ${url}`, e);
throw new StatusCodeError(`Error getting blob for image: ${url}`, 400);
}
});

/* Only return the body element, note that Hast synthesizes this if it wasn't
present in the input HTML. */
return toHtml(bodyNode);
}

/**
Expand Down Expand Up @@ -148,21 +251,59 @@ export async function validateMedia(context, info, mime, body) {
}
}

/**
 * Create the media handler used to intern external images.
 *
 * The handler targets the media bucket from the context's bucket map and the
 * content bus id from the context's configuration, uses the request's org and
 * site as owner and repo, and is pinned to the `main` ref.
 *
 * @param {import('../support/AdminContext').AdminContext} ctx context
 * @param {import('../support/RequestInfo').RequestInfo} info request info
 * @returns {MediaHandler} the configured media handler
 */
function getMediaHandler(ctx, info) {
  const { log, attributes } = ctx;
  const { org: owner, site: repo } = info;

  const options = {
    bucketId: attributes.bucketMap.media,
    owner,
    repo,
    ref: 'main',
    contentBusId: attributes.config.content.contentBusId,
    log,
    noCache: false,
    // limit image fetches to 5s
    fetchTimeout: 5000,
    forceHttp1: true,
    maxSize: DEFAULT_MAX_IMAGE_SIZE,
  };

  return new MediaHandler(options);
}

/**
 * Build the matchers for image URLs that are left untouched, i.e. that are
 * neither interned nor treated as external images.
 *
 * The list holds the `aem.page` and `aem.live` origins of the `main` ref for
 * the request's site and org as string prefixes, followed by a regex matching
 * Dynamic Media Delivery URLs.
 *
 * @param {import('../support/RequestInfo').RequestInfo} info request info
 * @returns {Array<string|RegExp>} the image URLs to keep, either as string
 * prefix or as regex
 */
function getKeptImageURLPrefixes(info) {
  const pageOrigin = `https://main--${info.site}--${info.org}.aem.page/`;
  const liveOrigin = `https://main--${info.site}--${info.org}.aem.live/`;

  // Allow any host for Dynamic Media Delivery URLs
  const dynamicMediaDelivery = /^https:\/\/[^/]+\/adobe\/dynamicmedia\/deliver\//;

  return [pageOrigin, liveOrigin, dynamicMediaDelivery];
}

/**
* Validate the body stored in the request info.
*
* @param {import('../support/AdminContext').AdminContext} context context
* @param {import('../support/RequestInfo').RequestInfo} info request info
* @param {string} mime media type
* @returns {Promise<Buffer>} body the message body as buffer
* @returns {Promise<Buffer|string>} body the message body as buffer or string
*/
export async function getValidPayload(context, info, mime) {
export async function getValidPayload(context, info, mime, internImages) {
const body = await info.buffer();

switch (mime) {
case 'text/html':
await validateHtml(context, body);
break;
// This may change the HTML (interning the images) so return its result
return getValidHtml(
context,
body,
getKeptImageURLPrefixes(info),
internImages ? getMediaHandler(context, info) : null,
);
case 'application/json':
await validateJson(context, body);
break;
Expand Down
56 changes: 53 additions & 3 deletions test/source/post.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { promisify } from 'util';
import zlib from 'zlib';
import { postSource } from '../../src/source/post.js';
import { createInfo, Nock } from '../utils.js';
import { setupContext } from './testutils.js';
import { setupContext, stripSpaces } from './testutils.js';

const gunzip = promisify(zlib.gunzip);

Expand All @@ -36,7 +36,7 @@ describe('Source POST Tests', () => {
it('test postSource HTML', async () => {
async function postFn(_uri, gzipBody) {
const b = await gunzip(Buffer.from(gzipBody, 'hex'));
assert.equal(b.toString(), '<html><body>Hello</body></html>');
assert.equal(b.toString(), '<body>Hello</body>');
}

nock.source()
Expand All @@ -53,12 +53,62 @@ describe('Source POST Tests', () => {
assert.equal(resp.status, 201);
});

// Posts HTML containing one external image and three kept image URLs, and checks
// that only the external reference is rewritten in the stored document.
it('test postSource HTML with images', async () => {
const imageHash = '1df1eef4cd16906957aa9d03ef3e2623e2bebecc2';

/* The image from example.com should be interned, but the other ones should be
left alone as they are in the list of kept image URLs. */
const htmlIn = `
<body>
<img src="https://example.com/image.jpg">
<img src="https://main--rest--test.aem.page/img1.jpg">
<img src="https://main--rest--test.aem.live/img2.jpg">
<img src="https://my.adobe.com/adobe/dynamicmedia/deliver/img3.jpg">
</body>`;

const htmlOut = `
<body>
<img src="https://main--rest--test.aem.page/media_${imageHash}.jpg">
<img src="https://main--rest--test.aem.page/img1.jpg">
<img src="https://main--rest--test.aem.live/img2.jpg">
<img src="https://my.adobe.com/adobe/dynamicmedia/deliver/img3.jpg">
</body>`;

// Reply callback for the media bucket PUT: the uploaded payload must be the
// bytes served for the external image below.
function imgPutFn(url, body) {
assert.equal(body, 'someimg');
}

// Reply callback for the source bucket PUT: gunzip the hex-encoded body and
// compare it with the expected markup, ignoring all whitespace.
async function htmlPutFn(url, gzipBody) {
const b = await gunzip(Buffer.from(gzipBody, 'hex'));
assert.equal(
stripSpaces(b.toString()),
stripSpaces(htmlOut),
'Should have interned the images to media bus',
);
}

// Serve the external image referenced by the first <img> in htmlIn.
const scope = new Nock()('https://example.com');
scope.get('/image.jpg').reply(200, 'someimg');

nock.media().headObject(`/${imageHash}`).reply(404); // report it not found
nock.media().putObject(`/${imageHash}`).reply(201, imgPutFn);
nock.source().putObject('/test/rest/toast/jam.html').reply(201, htmlPutFn);

const resp = await postSource(setupContext(), createInfo(
'/test/sites/rest/source/toast/jam.html',
{},
'POST',
htmlIn,
));
assert.equal(resp.status, 201);
});

it('test postSource invalid HTML', async () => {
const resp = await postSource(setupContext(), createInfo(
'/test/sites/rest/source/toast/jam.html',
{},
'POST',
'<html><body>Hello</bod',
'<body>Hello</bod',
));
assert.equal(resp.status, 400);
});
Expand Down
28 changes: 25 additions & 3 deletions test/source/put.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,22 @@ describe('Source PUT Tests', () => {
});

it('test putSource HTML with user', async () => {
const html = `
<body>
Hello
<img src="https://main--best--tst.aem.live/my-image.jpg">
</body>`;

async function putFn(_uri, gzipBody) {
const b = await gunzip(Buffer.from(gzipBody, 'hex'));
assert.equal(b.toString(), '<html><body>Hello</body></html>');
assert.equal(b.toString(), html);
}

nock.source()
.putObject('/tst/best/toast/jam.html')
.matchHeader('content-type', 'text/html')
.matchHeader('x-amz-meta-last-modified-by', '[email protected]')
.matchHeader('x-amz-meta-uncompressed-length', '31')
.matchHeader('x-amz-meta-uncompressed-length', '107')
.reply(201, putFn);

const path = '/tst/sites/best/source/toast/jam.html';
Expand All @@ -60,11 +66,27 @@ describe('Source PUT Tests', () => {

const resp = await putSource(
context,
createInfo(path, {}, 'PUT', '<html><body>Hello</body></html>'),
createInfo(path, {}, 'PUT', html),
);
assert.equal(resp.status, 201);
});

// Expects a PUT of HTML that references an image on another site's aem.live
// host to fail with status 400 and the "External images are not allowed" error.
it('test putSource HTML with external images is rejected', async () => {
// NOTE(review): the image host names someorg/somesite while the request path
// names myorg/mysite, so presumably the URL is not among the kept prefixes.
const html = `
<body>
Hello
<img src="https://main--somesite--someorg.aem.live/myimg.jpeg">
</body>`;

const path = '/myorg/sites/mysite/source/my-page.html';
const resp = await putSource(
setupContext(path),
createInfo(path, {}, 'PUT', html),
);
assert.equal(resp.status, 400);
assert.match(resp.headers.get('x-error'), /External images are not allowed, use POST to intern them/);
});

it('test putSource JSON', async () => {
function putFn(_uri, body) {
assert.deepStrictEqual(body, { something: 'else' });
Expand Down
4 changes: 4 additions & 0 deletions test/source/testutils.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ export function setupContext(suffix, { attributes = {} } = {}) {
},
});
}

/**
 * Remove every whitespace character from a string, so that markup can be
 * compared without regard to formatting.
 *
 * @param {string} str the input string
 * @returns {string} the input with all whitespace removed
 */
export function stripSpaces(str) {
  const withoutWhitespace = str.replaceAll(/\s+/g, '');
  return withoutWhitespace;
}
Loading