本文用于记录:在开发自己的爬虫项目时,我发现有些网站需要登录后才能继续获取数据,经过查询后解决了这个问题。
其实很简单,只需要添加几行代码即可实现。
此方法只适用于基于 HttpURLConnection 的客户端。
在原来的请求代码之前添加如下代码:
// Install a default cookie handler before any request is made.
// Passing null as the store lets CookieManager create its own in-memory store;
// ACCEPT_ALL replaces the default policy so that every cookie is accepted.
CookieManager cookieManager = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
CookieHandler.setDefault(cookieManager);
// Usage example
String loginUrl = "https://tieba.baidu.com/login.html";
String dataUrl = "https://tieba.baidu.com/data.html?pg=1";

// Step 1: log in by posting the credentials as form values.
TriHttpRequest loginRequest = new TriHttpRequest(loginUrl, "UTF-8");
loginRequest.setContentType("application/x-www-form-urlencoded");
loginRequest.addPostValue("username", "xxx");
loginRequest.addPostValue("password", "xxx");
loginRequest.post();
// Cookies in the store after the first request
printCookie(cookieManager.getCookieStore());

// Step 2: go on to collect the data with a second request.
TriHttpRequest dataRequest = new TriHttpRequest(dataUrl, "UTF-8");
dataRequest.setContentType("application/x-www-form-urlencoded");
dataRequest.addPostValue("keyword", "xxxx");
String responseBody = dataRequest.post().getReponse();
// Cookies in the store after the second request
printCookie(cookieManager.getCookieStore());
logger.info(responseBody);
// Cookie details (printed by printCookie)
/**
 * Prints every attribute of each cookie returned by the given store to
 * standard output, preceded by a separator line per cookie.
 *
 * @param cookieStore the store whose cookies are printed
 */
private void printCookie(CookieStore cookieStore) {
    for (HttpCookie cookie : cookieStore.getCookies()) {
        System.out.println("--------------------------------------");
        System.out.println("class : " + cookie.getClass());
        System.out.println("comment : " + cookie.getComment());
        System.out.println("commentURL : " + cookie.getCommentURL());
        System.out.println("discard : " + cookie.getDiscard());
        System.out.println("domain : " + cookie.getDomain());
        System.out.println("maxAge : " + cookie.getMaxAge());
        System.out.println("name : " + cookie.getName());
        System.out.println("path : " + cookie.getPath());
        System.out.println("portlist : " + cookie.getPortlist());
        System.out.println("secure : " + cookie.getSecure());
        System.out.println("value : " + cookie.getValue());
        System.out.println("version : " + cookie.getVersion());
        // Finally the cookie's own string form
        System.out.println("httpCookie : " + cookie);
    }
}
上方的TriHttpxxx工具类 详见此文章
部分 HTTPS 站点可能会出现证书错误,详见此文章。
将以下代码加入 TriHttpRequest(放在 openConnection 这一行之前):
// Apply the certificate workaround only when the target URL uses HTTPS.
if ("https".equalsIgnoreCase(this.url.getProtocol())) {
    try {
        // NOTE(review): presumably relaxes SSL certificate checking so that
        // sites with certificate errors can still be fetched; confirm in SslUtils.
        SslUtils.ignoreSsl();
    } catch (Exception sslSetupError) {
        // A failure here is only reported; the connection is still opened below.
        sslSetupError.printStackTrace();
    }
}
// The block above goes before this line
this.conn = (HttpURLConnection) this.url.openConnection();
