时间:2023-1-20 作者:老大夫 分类: 传智JAVA爬虫学习笔记
按照老教程的写法已经无法爬取京东了,因为直接请求会被重定向到京东登录页。
所以我们需要伪装请求头,让请求看起来像是浏览器发出的。
package cn.itcast.jd.util;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
@Component
public class HttpUtils {
//连接池管理器
private PoolingHttpClientConnectionManager cm;
//连接池管理器构造方法
public HttpUtils() {
this.cm = new PoolingHttpClientConnectionManager();
//设置最大连接数
this.cm.setMaxTotal(100);
//设置每个主机的最大连接数
this.cm.setDefaultMaxPerRoute(10);
}
/**
* 根据请求地址下载页面数据
* @param url
* @return 页面数据
*/
public String doGetHtml(String url){
//获取httpClient对象
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
//设置HttpGet请求对象,设置url地址
HttpGet httpGet= new HttpGet(url);
//设置请求信息
httpGet.setConfig(this.getConfig());
//伪装请求头 ,这里信息太多我就不贴了
httpGet.setHeader("a123123d.com");
httpGet.setHeader("123123123");
httpGet.setHeader("1231231230.7,213123126");
httpGet.setHeader("1232113123");
httpGet.setHeader("c123123D4Q");
httpGet.setHeader("123123123/");
httpGet.setHeader("123123123213"");
httpGet.setHeader("123123123");
httpGet.setHeader("12321312"");
httpGet.setHeader("123123123");
httpGet.setHeader("12312312");
httpGet.setHeader("123123213");
httpGet.setHeader("12312312");
httpGet.setHeader("12312312");
httpGet.setHeader("123123123");
CloseableHttpResponse response=null;
try {
//使用httpClient发起请求,获取响应
response = httpClient.execute(httpGet);
//解析响应返回结果
if(response.getStatusLine().getStatusCode() == 200){
//判断响应体是否不为空,如果不为空就可以使用EntityUtils
if(response.getEntity() != null){
String content = EntityUtils.toString(response.getEntity(), "utf8");
return content;
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if(response != null){
try {
response.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
//返回空串
return "";
}
/**
* 下载图片
* @param url
* @return 图片名称
*/
public String doGetImage(String url){
//获取httpClient对象
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
//设置HttpGet请求对象,设置url地址
HttpGet httpGet= new HttpGet(url);
//设置请求信息
httpGet.setConfig(this.getConfig());
//伪装请求头 ,这里信息太多我就不贴了
httpGet.setHeader("a123123d.com");
httpGet.setHeader("123123123");
httpGet.setHeader("1231231230.7,213123126");
httpGet.setHeader("1232113123");
httpGet.setHeader("c123123D4Q");
httpGet.setHeader("123123123/");
httpGet.setHeader("123123123213"");
httpGet.setHeader("123123123");
httpGet.setHeader("12321312"");
httpGet.setHeader("123123123");
httpGet.setHeader("12312312");
httpGet.setHeader("123123213");
httpGet.setHeader("12312312");
httpGet.setHeader("12312312");
httpGet.setHeader("123123123");
CloseableHttpResponse response=null;
try {
//使用httpClient发起请求,获取响应
response = httpClient.execute(httpGet);
//解析响应返回结果
if(response.getStatusLine().getStatusCode() == 200){
//判断响应体是否不为空,如果不为空就可以使用EntityUtils
if(response.getEntity() != null){
//下载图片
//获取图片的后缀
String extName = url.substring(url.lastIndexOf("."));
//创建一个图片名,重命名图片
String picName = UUID.randomUUID().toString()+extName;
//下载图片
//声明outputStream
OutputStream outputStream =new FileOutputStream(new File("C:\\Users\\16259\\Desktop\\images\\"+picName));
response.getEntity().writeTo(outputStream);
//返回图片的名称
return picName;
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if(response != null){
try {
response.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
//如果下载失败,返回空串
return "";
}
//设置请求信息
private RequestConfig getConfig() {
RequestConfig config = RequestConfig.custom()
.setConnectTimeout(1000) //创建连接的最长时间
.setConnectionRequestTimeout(500) //获取连接的最长时间
.setSocketTimeout(10 * 1000) //数据传输的最长时间
.build();
return config;
}
}
推荐阅读: