<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>${latest.version}</version>
</dependency>
代码并不多,通过 URL 地址获取 HTML 文本内容,并进行解析。
public class JsoupTest {
public static void main(String[] args) throws IOException {
// 要爬取的网址url链接列表
List list = new ArrayList<>();
for (int i = 0; i <= 225; i += 25) {
String url = “https://movie.douban.com/top250?start=” + i + “&filter=”;
list.add(url);
}
// 遍历url列表,爬取网页数据
for (String urlStr : list) {
Document doc = Jsoup.connect(urlStr)
.maxBodySize(Integer.MAX_VALUE)
.userAgent(“Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36”)
.timeout(6000)
.get();
Element co