blob: ec801aa44b78b37b56ff17b74de7371f4f25e30f [file] [log] [blame]
/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17import { Request, Response } from 'express';
18import puppeteer = require('puppeteer');
19import { log } from './logger';
20import { ContentNode } from './types';
21import { PlainTextFormatter } from './plain_text_formatter';
Rahul Ravikumare8c45eb2023-11-13 16:44:53 -080022import { transformUrl } from './url-transforms';
Rahul Ravikumar82028732019-04-23 18:11:25 -070023
// Command-line switches handed to Chrome through the `args` option of
// `puppeteer.launch` when the browser is started for a license request.
const CHROME_LAUNCH_ARGS = ['--enable-dom-distiller'];
Rahul Ravikumar82028732019-04-23 18:11:25 -070025
// Lookup-table shape for DOM node names that are usually not useful when
// fetching text content from a page: a name is banned when it maps to `true`.
type BannedNames = Record<string, true>;
31
/**
 * Handles the actual license request.
 *
 * Reads the target `url` from the request body, checks it with
 * `isValidProtocol`, fetches the page content through
 * `handleLicenseRequest` and replies with its plain-text rendering.
 *
 * Responses:
 *  - `200` with the plain-text content on success.
 *  - `400 'URL required'` when the body is missing or carries no `url`.
 *  - `400 'Invalid request.'` when the protocol check fails.
 *  - `400 'Something bad happened. Check the logs'` when anything throws.
 *
 * @param request The incoming request; `request.body.url` names the page.
 * @param response The response used to send the status and payload.
 */
export async function handleRequest(request: Request, response: Response): Promise<void> {
  // `request.body` is undefined when no body parser populated it. Guard the
  // lookup so such a request is answered with a 400 instead of rejecting
  // this handler before any response has been sent.
  const body = request.body;
  const url = body == null ? undefined : body.url;
  if (!url) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }

    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    // Any failure while validating, fetching or formatting ends up here.
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
56
/**
 * Validates the protocol. Only allows `https?` requests.
 *
 * A string that cannot be parsed as an absolute URL has no usable protocol,
 * so it is reported as invalid (and logged) rather than throwing.
 *
 * @param requestUrl The request url
 * @return `true` if the protocol is valid.
 */
function isValidProtocol(requestUrl: string): boolean {
  let protocol: string;
  try {
    protocol = new URL(requestUrl).protocol;
  } catch (error) {
    // The URL constructor throws on input it cannot parse.
    log(`Unable to parse request url ${requestUrl}`, error);
    return false;
  }

  // `URL.protocol` includes the trailing colon.
  if (protocol === 'https:' || protocol === 'http:') {
    return true;
  }
  log(`Invalid protocol ${protocol}`);
  return false;
}
75
/**
 * Loads a page in headless Chrome and extracts its text content.
 *
 * The url is first passed through `transformUrl`. The page is then opened
 * with Puppeteer and, once its DOM content has loaded, the `<body>` is
 * walked inside the page to collect text. The browser is always closed
 * before this function settles, whether extraction succeeds or fails.
 *
 * @param url The page to fetch.
 * @param enableLocalDebugging When `true`, launches Chrome with devtools and
 *     forwards the page's console messages to the log.
 * @return A flat list of `{localName, textContent}` entries, one for every
 *     visited node with non-empty text, in pre-order.
 */
async function handleLicenseRequest(url: string, enableLocalDebugging: boolean = false): Promise<ContentNode[]> {
  const transformed = transformUrl(url);
  if (url !== transformed) {
    log(`Transformed request url to ${transformed}`);
  }
  const browser = await puppeteer.launch({
    args: CHROME_LAUNCH_ARGS,
    devtools: enableLocalDebugging,
    // https://developer.chrome.com/articles/new-headless/
    headless: true
  });
  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(transformed, { waitUntil: 'domcontentloaded' });
    // NOTE: the callback below is evaluated inside the page, so it only
    // uses names it declares itself (plus DOM globals), never module scope.
    return await page.evaluate(() => {
      // A map of banned nodes
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // node list handler: converts every node of the list, dropping the
      // ones that `contentForNode` rejects.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // content handler: flattens a tree of content nodes (pre-order),
      // keeping only the entries that carry text.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // node handler: returns the content tree rooted at `node`, or `null`
      // when the node (and therefore its whole subtree) is skipped.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node, and not the child nodes.
        const cloned = node.cloneNode();
        const localName = name;
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // anchor element
          // Ensure that it has reasonable href content. The raw attribute
          // is inspected because the `href` property is the fully resolved
          // URL, which is never equal to '#'.
          const href = cloned.getAttribute('href');
          if (href === null || href.length <= 0 || href === '#') {
            return null;
          }
        }
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }
      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        const node = nodes[i];
        contentWithPath(node, accumulator);
      }
      return accumulator;
    });
  } finally {
    // Always release the Chrome process, even when navigation or
    // evaluation throws.
    await browser.close();
  }
}