blob: 9569c5c47710d00e2ed5befdaec63f5115482233 [file] [log] [blame]
/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17import { Request, Response } from 'express';
18import puppeteer = require('puppeteer');
19import { log } from './logger';
20import { ContentNode } from './types';
21import { PlainTextFormatter } from './plain_text_formatter';
22
/** Command-line flags handed to Chrome through `puppeteer.launch({ args })`. */
const CHROME_LAUNCH_ARGS = ['--enable-dom-distiller'];
Rahul Ravikumar82028732019-04-23 18:11:25 -070024
// Shape of the lookup table of DOM node names that are usually not useful
// in the context of fetching text content from the page. A name is banned
// when the table maps its lower-cased node name to `true`.
type BannedNames = {
  [key: string]: true;
};
30
/**
 * Handles the actual license request.
 *
 * Reads the page address from `request.body.url`, collects the page content
 * via `handleLicenseRequest`, formats it with
 * `PlainTextFormatter.plainTextFor`, and sends the result.
 *
 * Responses:
 *  - 200 with the formatted content when everything succeeds.
 *  - 400 `URL required` when the body is missing or `url` is not a non-empty
 *    string.
 *  - 400 `Something bad happened. Check the logs` when fetching or formatting
 *    throws; the error itself is only written to the log.
 *
 * NOTE(review): `url` is caller-supplied and is handed to the browser as-is.
 * If untrusted clients can reach this endpoint, consider restricting the
 * accepted schemes/hosts to avoid server-side request forgery.
 *
 * @param request Incoming request; `request.body` is expected to have been
 *     populated by a body parser.
 * @param response Response used to send the status code and payload.
 */
export async function handleRequest(request: Request, response: Response): Promise<void> {
  // `request.body` is undefined when no body parser ran. Guard before reading
  // `url` so the handler answers with a 400 instead of rejecting and leaving
  // the client without a response.
  const body = request.body;
  const url = body ? body.url : undefined;
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
50
/**
 * Loads `url` in a freshly launched browser and collects the text found under
 * the page's `<body>`.
 *
 * The DOM walk runs inside the page. Nodes whose lower-cased `nodeName` is in
 * the banned table are dropped together with their subtrees. The remaining
 * tree is flattened depth-first (parent before children) into
 * `{ localName, textContent }` entries, keeping only entries whose
 * `textContent` is non-empty.
 *
 * The browser is closed on every exit path, so a failed navigation or
 * evaluation does not leave a browser process behind.
 *
 * @param url Address of the page to load.
 * @returns The flattened entries, in the order they were visited.
 * @throws Whatever Puppeteer throws while launching, navigating or evaluating.
 */
async function handleLicenseRequest(url: string): Promise<ContentNode[]> {
  const browser = await puppeteer.launch({ args: CHROME_LAUNCH_ARGS });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    // The callback below is executed in the page, not in Node: it must be
    // self-contained and cannot reference anything from the enclosing module
    // at runtime (types are fine, they are erased).
    const content = await page.evaluate(() => {
      // A map of banned nodes, keyed by lower-cased node name.
      const BANNED_LOCAL_NAMES: BannedNames = {
        'a': true,
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every node of `list` and keeps the ones that were not banned.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Flattens `node` and its descendants into `accumulator`, parent first,
      // skipping entries without text.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Converts a single DOM node (and, recursively, its children) into a
      // content tree; returns null for missing or banned nodes.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node itself, and not the text of its child nodes. Children
        // are visited separately below.
        const cloned = node.cloneNode();
        const localName = name;
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        const node = nodes[i];
        contentWithPath(node, accumulator);
      }
      return accumulator;
    });
    return content;
  } finally {
    // Always release the browser, including when navigation or evaluation
    // failed above.
    await browser.close();
  }
}
136}