When crawling data from websites, you will often run into a common problem: the server restricts access by IP, for example by limiting the number of requests allowed per unit of time. In that case, we can work around the restriction by routing requests through a proxy.
HttpClient has built-in support for proxy configuration. The official example code is as follows:
package org.apache.http.examples.client;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * How to send a request via proxy.
 *
 * @since 4.0
 */
public class ClientExecuteProxy {

    public static void main(String[] args) throws Exception {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            // The target host, i.e. the server (such as a website) you actually want to access.
            HttpHost target = new HttpHost("localhost", 443, "https");
            // The proxy host through which the request will be routed.
            HttpHost proxy = new HttpHost("127.0.0.1", 8080, "http");

            RequestConfig config = RequestConfig.custom()
                    .setProxy(proxy)
                    .build();
            HttpGet request = new HttpGet("/");
            request.setConfig(config);

            System.out.println("Executing request " + request.getRequestLine()
                    + " to " + target + " via " + proxy);

            // Execute the request against the target host.
            CloseableHttpResponse response = httpclient.execute(target, request);
            try {
                System.out.println("----------------------------------------");
                // Print the response status line.
                System.out.println(response.getStatusLine());
                EntityUtils.consume(response.getEntity());
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }
    }
}
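In practice, the proxies used for crawling (especially paid proxy services) often require authentication, and it can be more convenient to set the proxy once on the client rather than on every request. The sketch below is a minimal illustration of both ideas, assuming a hypothetical proxy at 127.0.0.1:8080 with the placeholder credentials "user"/"pass"; substitute your own proxy host, port, and credentials.

package org.apache.http.examples.client;

import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ClientProxyAuthentication {

    public static void main(String[] args) throws Exception {
        // Register credentials for the proxy's host and port (placeholder values).
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
                new AuthScope("127.0.0.1", 8080),
                new UsernamePasswordCredentials("user", "pass"));

        // Build a client with a default proxy and the credentials provider,
        // so every request goes through the proxy without per-request config.
        HttpHost proxy = new HttpHost("127.0.0.1", 8080, "http");
        CloseableHttpClient httpclient = HttpClients.custom()
                .setDefaultCredentialsProvider(credsProvider)
                .setProxy(proxy)
                .build();
        try {
            HttpHost target = new HttpHost("localhost", 443, "https");
            HttpGet request = new HttpGet("/");

            CloseableHttpResponse response = httpclient.execute(target, request);
            try {
                System.out.println(response.getStatusLine());
                EntityUtils.consume(response.getEntity());
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }
    }
}

Note the trade-off between the two approaches: setting the proxy through RequestConfig, as in the official example, decides the proxy per request, which suits a crawler that rotates through a pool of proxies; setting it through HttpClientBuilder.setProxy fixes one proxy for the whole client.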