blob: 8c3579b172db663ac56bc214e1505846a30a0ae3 [file] [log] [blame]
/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17import { Request, Response } from 'express';
18import puppeteer = require('puppeteer');
19import { log } from './logger';
20import { ContentNode } from './types';
21import { PlainTextFormatter } from './plain_text_formatter';
22
/**
 * Command-line switches handed to the browser through the `args` option of
 * `puppeteer.launch` when a license page is fetched.
 */
const CHROME_LAUNCH_ARGS: string[] = [
  '--enable-dom-distiller',
];
Rahul Ravikumar82028732019-04-23 18:11:25 -070024
/**
 * A lookup map keyed by lower-cased DOM node name. A name that is present
 * (always mapped to `true`) marks a node type that is usually not useful when
 * fetching text content from the page.
 */
type BannedNames = Record<string, true>;
30
/**
 * Handles the actual license request.
 *
 * Expects the target page address in `request.body.url`. Responds with:
 *  - `200` and the plain-text rendering of the page content on success;
 *  - `400 URL required` when the body or a non-empty string `url` is missing;
 *  - `400 Invalid request.` when the URL fails protocol validation;
 *  - `400 Something bad happened. Check the logs` when fetching or formatting
 *    throws (the error itself is logged, never sent to the client).
 *
 * NOTE(review): the URL is caller-supplied and is loaded by a headless
 * browser; only the protocol is validated here, so internal hosts remain
 * reachable. Consider a host allow-list if this endpoint is exposed.
 *
 * @param request The incoming Express request.
 * @param response The Express response used to reply.
 */
export async function handleRequest(request: Request, response: Response) {
  // The body may be absent (e.g. no body-parsing middleware populated it).
  // Reading `.url` off it unguarded would throw outside the try block below
  // and leave the request without any response.
  const url: unknown = request.body ? request.body.url : undefined;
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }

    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
55
/**
 * Validates the protocol. Only allows `https?` requests.
 *
 * A string that cannot be parsed as an absolute URL is treated as invalid
 * rather than letting the `URL` constructor's `TypeError` escape, so callers
 * always get a boolean answer.
 *
 * @param requestUrl The request url
 * @return `true` if the url parses and its protocol is `http:` or `https:`.
 */
function isValidProtocol(requestUrl: string): boolean {
  let protocol: string;
  try {
    protocol = new URL(requestUrl).protocol;
  } catch (error) {
    // `new URL` throws on malformed or relative input.
    log(`Invalid url ${requestUrl}`);
    return false;
  }

  if (protocol === 'https:' || protocol === 'http:') {
    return true;
  }
  log(`Invalid protocol ${protocol}`);
  return false;
}
74
/**
 * Loads `url` in a headless browser and extracts the text-bearing nodes of
 * the page body.
 *
 * The browser is always closed before this function settles, including when
 * navigation or extraction fails.
 *
 * @param url The page to load; callers are expected to have validated it.
 * @param enableLocalDebugging When `true`, launches with devtools enabled and
 *     forwards the page's console messages to the logger.
 * @return A flat list of `{ localName, textContent }` entries, in document
 *     order, for every visited node whose own text content is non-empty.
 */
async function handleLicenseRequest(url: string, enableLocalDebugging: boolean = false): Promise<ContentNode[]> {
  const browser = await puppeteer.launch({ args: CHROME_LAUNCH_ARGS, devtools: enableLocalDebugging });
  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(url, { waitUntil: 'domcontentloaded' });

    // Everything inside this callback runs in the page, not in Node, so it
    // must be self-contained: it cannot reference module-level values.
    const content = await page.evaluate(() => {
      // A map of banned nodes
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every node of a NodeList, dropping the ones that were
      // filtered out (converted to `null`).
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Depth-first flattening: appends `node` (without its children) to the
      // accumulator when it has text, then recurses into its children.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Converts a single DOM node (and, recursively, its subtree) into a
      // ContentNode, or `null` when the node should be skipped entirely.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node, and not the child nodes.
        const cloned = node.cloneNode();
        const localName = name;
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // Ensure the anchor has reasonable href content. Inspect the raw
          // attribute: the `href` property is resolved against the document
          // URL, so it would never equal a bare '#'.
          const href = cloned.getAttribute('href');
          if (!href || href === '#') {
            return null;
          }
        }
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        contentWithPath(nodes[i], accumulator);
      }
      return accumulator;
    });
    return content;
  } finally {
    // Close the browser on every path so a failed navigation or evaluation
    // does not leave a browser process running.
    await browser.close();
  }
}
173}