JAVA爬虫实践(实践一:知乎)

该博客展示了如何使用Java实现一个简单的知乎爬虫,通过模拟HTTP GET和POST请求获取页面数据。首先,利用HttpClient发送GET请求并设置必要头部信息,包括Cookie和User-Agent。然后,解析返回的HTML内容,使用Jsoup提取标题和内容。接着,进行POST请求获取更多数据,更新并追加到文件中。整个过程中,注意了请求头中的X-Xsrftoken和Cookie值,确保请求的合法性。
摘要由CSDN通过智能技术生成

package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

@SuppressWarnings("deprecation")public classZhihuSpider {/*** 模拟HTTP GET请求*/

public String doGet() throwsClientProtocolException, IOException {

String str= "";//创建HttpClient实例

HttpClient httpClient = newDefaultHttpClient();//创建Get方法实例

HttpUriRequest httpUriRequest = new HttpGet("http://www.zhihu.com");//添加必要的头信息

httpUriRequest

.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");

httpUriRequest

.setHeader("Cookie","这里的Cookie拷贝复制登录后请求头里的Cookie值");

httpUriRequest.setHeader("DNT", "1");

httpUriRequest.setHeader("Connection", "keep-alive");

httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1");

httpUriRequest.setHeader("Cache-Control", "max-age=0");

HttpResponse response=httpClient.execute(httpUriRequest);

HttpEntity entity=response.getEntity();if (entity != null) {

InputStream inputStream=entity.getContent();

str=convertStreamToString(inputStream);

}returnstr;

}public staticString convertStreamToString(InputStream is)throwsIOException {

InputStreamReader ir= new InputStreamReader(is, "UTF8");

BufferedReader reader= newBufferedReader(ir);

StringBuilder sb= newStringBuilder();

String line= null;try{while ((line = reader.readLine()) != null) {

sb.append(line+ "\n");

}

}catch(IOException e) {

e.printStackTrace();

}finally{try{

is.close();

}catch(IOException e) {

e.printStackTrace();

}

}returnsb.toString();

}//下载 URL 指向的网页

@SuppressWarnings("static-access")

@Testpublic void downloadFile() throwsException {//模拟HTTP GET请求

String responseBody =doGet();//解析数据

String writeStr =unparsedData(responseBody);//创建新文件

String path = "D:\\testFile\\zhihu.txt";

PrintWriter printWriter= null;

printWriter= new PrintWriter(new FileWriter(newFile(path)));//写内容

printWriter.write(writeStr);

printWriter.close();int offset = 10;int start = 9;for (int time = 0; time <= 100; time++) {//模拟POST请求

JSONObject jsonObject =JSONObject

.fromObject(doPost(offset, start));//解析数据(只拿JSON数据里的msg数组)

String addWriteStr = "";

JSONArray jsonArray= jsonObject.getJSONArray("msg");

Object[] arrays=jsonArray.toArray();for(Object array : arrays) {

addWriteStr+=unparsedData(array.toString());

}//追加文本

printWriter = new PrintWriter(new FileWriter(path, true));

printWriter.write(addWriteStr);

printWriter.close();//延时,调整参数

Thread.currentThread().sleep(1000);//毫秒

offset = offset + 10;

start= start + 10;

}

}/*** 根据HTML解析数据

*

*@paramhtml

* 源HTML

*@return解析后的数据*/

publicString unparsedData(String html) {

Document doc=Jsoup.parse(html);

Elements feeds= doc.getElementsByAttributeValue("class","feed-item-inner");

String writeStr= "";for(Element feed : feeds) {

Elements title= newElements();

Elements feedTitles= feed.getElementsByAttributeValue("class","feed-title");for(Element feedTitle : feedTitles) {

title= feedTitle.getElementsByTag("a");

}

Elements content= feed.getElementsByTag("textarea");

String titleHref= title.attr("href");

String titleText=title.text().trim();

String contentText=content.text().trim();//if(!titleText.contains("人民的名义")){//continue;//}

System.out.println("--------------------");

System.out.println("-----标题-----");

System.out.println("链接:" +titleHref);

System.out.println("内容:" +titleText);

System.out.println("-----内容-----");

System.out.println("内容:" +contentText);

System.out.println("--------------------");

writeStr+= "--------------------\n-----标题-----\n" +titleHref+ "\n" + titleText + "\n-----内容-----\n" +contentText+ "\n--------------------\n\n\n";

}returnwriteStr;

}/*** 模拟HTTP POST请求

*

*@paramoffset

* 参数offset

*@paramstart

* 参数start

*@return请求返回的JSON数据*/

public String doPost(int offset, int start) throwsException {

HttpClient httpClient= newDefaultHttpClient();

HttpUriRequest httpUriRequest=RequestBuilder

.post()

.setUri("https://www.zhihu.com/node/TopStory2FeedList")

.addParameter("params","{\"offset\":" + offset + ",\"start\":\"" +start+ "\"}").addParameter("method", "next").build();//添加必要的头信息

httpUriRequest

.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");

httpUriRequest.setHeader("X-Xsrftoken","这里的X-Xsrftoken拷贝复制登录后请求头里的X-Xsrftoken值");

httpUriRequest.setHeader("X-Requested-With", "XMLHttpRequest");

httpUriRequest.setHeader("Referer", "https://www.zhihu.com/");

httpUriRequest

.setHeader("Cookie","这里的Cookie拷贝复制登录后请求头里的Cookie值");

httpUriRequest.setHeader("DNT", "1");

httpUriRequest.setHeader("Connection", "keep-alive");

httpUriRequest.setHeader("Cache-Control", "max-age=0");

HttpResponse response=httpClient.execute(httpUriRequest);

String str= "";

HttpEntity entity=response.getEntity();if (entity != null) {

InputStream instreams=entity.getContent();

str=convertStreamToString(instreams);

}returnstr;

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值