爬取目标:https://movie.douban.com/top250 (豆瓣电影 Top250)。爬取每部电影详情页中下图框出来的内容,并把爬到的数据写入 logback 日志。
爬取时间:2020年3月初。
用HttpClient获取页面,用Jsoup进行解析。
效果如下:
如果后续需要把数据导入数据库,可以自行更换分隔符(代码中各字段之间用制表符 \t 分隔)。
代码:
package com.lane.httpclient;
import org.apache.http.HttpEntity;
import org.apache.http.StatusLine;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls the Douban Top 250 movie list and logs one tab-separated record per movie.
 *
 * <p>List pages and movie detail pages are downloaded with Apache HttpClient and parsed
 * with Jsoup. Each record is written through the SLF4J logger at INFO level; progress
 * (the next list-page URL) is printed to standard output.
 */
public class HttpClientDemo {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientDemo.class);

    /** First page of the Top 250 list; "next page" links are relative to this URL. */
    private static final String TOP250_URL = "https://movie.douban.com/top250";

    /** Upper bound on the number of list pages visited by {@link #getMovie()}. */
    private static final int PAGE_COUNT = 10;

    /** Exclusive upper bound, in milliseconds, of the random pause between detail requests. */
    private static final int MAX_DELAY_MILLIS = 10000;

    /** Pool of User-Agent values; one is picked at random for every request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"
    };

    /**
     * Downloads the body of {@code URL} with an HTTP GET request.
     *
     * <p>A fresh client is created per call and a random User-Agent from
     * {@link #USER_AGENTS} is sent. The body is decoded with the charset announced by the
     * server, falling back to UTF-8 when none is announced.
     *
     * <p>This is a plain helper, not a test: it takes an argument and returns a value, so
     * it must not carry a JUnit {@code @Test} annotation.
     *
     * @param URL address of the page to fetch
     * @return the response body, or {@code null} when {@code URL} is null/empty, the
     *         status code is not 200, the response has no body, or an I/O error occurs
     * @throws InterruptedException never thrown by the current implementation; kept in the
     *         signature so existing callers keep compiling
     */
    public String requestData(String URL) throws InterruptedException {
        if (URL == null || URL.isEmpty()) {
            return null;
        }

        // The request object carries the URL and the headers of the simulated browser.
        HttpGet httpGet = new HttpGet(URL);
        String userAgent = USER_AGENTS[(int) (Math.random() * USER_AGENTS.length)];
        httpGet.setHeader("User-Agent", userAgent);

        // try-with-resources closes the response first and then the client, on every path.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() != 200) {
                logger.warn("GET {} returned status {}", URL, statusLine.getStatusCode());
                return null;
            }
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                return null;
            }
            // Without an explicit default, a response lacking a charset would be decoded
            // as ISO-8859-1 and the Chinese text would be garbled.
            return EntityUtils.toString(httpEntity, "UTF-8");
        } catch (IOException e) {
            logger.error("GET {} failed", URL, e);
            return null;
        }
    }

    /**
     * Walks the Top 250 list pages, fetches every movie's detail page and hands it to
     * {@link #parseHtml(String, int)}.
     *
     * <p>After each movie the thread sleeps for a random time below
     * {@link #MAX_DELAY_MILLIS} milliseconds. The walk stops early when a list page cannot
     * be fetched or has no "next page" link.
     *
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public void getMovie() throws InterruptedException {
        int rank = 1;
        String pageUrl = TOP250_URL;
        for (int page = 1; page <= PAGE_COUNT; page++) {
            String listHtml = requestData(pageUrl);
            if (listHtml == null) {
                // Without the list page neither the movies nor the next link are known.
                logger.warn("Could not fetch list page {}, stopping", pageUrl);
                break;
            }
            Document listPage = Jsoup.parse(listHtml);

            for (Element item : listPage.select("div.item")) {
                String href = item.select("div.hd > a").attr("href");
                // parseHtml copes with a null body, so a failed fetch only skips this movie.
                parseHtml(requestData(href), rank);
                // Math.random() is in [0, 1), so the delay is never negative.
                Thread.sleep((long) (Math.random() * MAX_DELAY_MILLIS));
                rank++;
            }

            // The "next" link is relative (a query string appended to the list URL).
            String nextHref = listPage.select("span.next > a").attr("href");
            if (nextHref.isEmpty()) {
                break;
            }
            pageUrl = TOP250_URL + nextHref;
            System.out.println(pageUrl);
        }
    }

    /**
     * Extracts the movie fields from a detail page and logs them as one tab-separated line.
     *
     * <p>Fields, in order: title, score, number of ratings, director, writer, cast, genre,
     * country/region, language, release date, runtime, alias. A field that is missing on
     * the page is logged with an empty value. Multi-valued fields are joined with
     * {@code /}. Nothing is logged at INFO level when {@code content} is {@code null} or
     * the page has no {@code div#info} block.
     *
     * @param content HTML of the movie detail page; may be {@code null}
     * @param i       running number of the movie, used only in diagnostic messages
     */
    public void parseHtml(String content, int i) {
        if (content == null) {
            logger.warn("Movie #{}: detail page could not be fetched, skipped", i);
            return;
        }
        Document document = Jsoup.parse(content);
        Element info = document.selectFirst("div#info");
        if (info == null) {
            logger.warn("Movie #{}: no div#info block found, skipped", i);
            return;
        }

        String name = "电影名: " + document.select("h1 span[property=v:itemreviewed]").text();
        String score = "评分: "
                + document.select("div.rating_self.clearfix strong.ll.rating_num").text();
        String number = "评分人数: "
                + document.select("div.rating_right div.rating_sum > a > span").text() + "人";

        // Director and writer rows are located by their label, not by their position.
        String director = "导演: " + joinTexts(info.select("span.pl:contains(导演) ~ span.attrs > a"));
        String playwriter = "编剧: " + joinTexts(info.select("span.pl:contains(编剧) ~ span.attrs > a"));
        String actor = "主演: " + joinTexts(info.select("span.actor span.attrs > a"));
        String genre = "类型: "
                + joinTexts(info.select("span:contains(类型:) ~ span[property=v:genre]"));

        // Country, language and alias are bare text nodes that follow their label span.
        String country = "制片国家/地区: " + textAfter(info.selectFirst("span.pl:contains(制片国家/地区)"));
        String language = "语言: " + textAfter(info.selectFirst("span.pl:contains(语言)"));

        String date = "上映日期: " + joinTexts(
                info.select("span:contains(上映日期:) ~ span[property=v:initialReleaseDate]"));

        // The first runtime is inside a span; additional runtimes, if any, follow it as a
        // bare text node such as "/ 171分钟(导演剪辑版)".
        Element runtimeSpan = info.selectFirst("span[property=v:runtime]");
        String runtime = runtimeSpan == null ? "" : runtimeSpan.text();
        String extraRuntime = textAfter(runtimeSpan);
        String length = "片长: " + (extraRuntime.isEmpty() ? runtime : runtime + " " + extraRuntime);

        String alias = "又名: " + textAfter(info.selectFirst("span.pl:contains(又名)"));

        logger.info(String.join("\t", name, score, number, director, playwriter, actor, genre,
                country, language, date, length, alias));
    }

    /** Joins the text of every element that has text with {@code /}. */
    private static String joinTexts(Elements elements) {
        return String.join("/", elements.eachText());
    }

    /**
     * Returns the trimmed text node that directly follows {@code anchor}, or an empty
     * string when {@code anchor} is {@code null} or is not followed by a text node.
     */
    private static String textAfter(Element anchor) {
        if (anchor != null && anchor.nextSibling() instanceof TextNode) {
            return ((TextNode) anchor.nextSibling()).text().trim();
        }
        return "";
    }

    /** Entry point: crawls the whole list once. */
    public static void main(String[] args) throws InterruptedException {
        HttpClientDemo httpClientDemo = new HttpClientDemo();
        httpClientDemo.getMovie();
    }
}
分析:整体很简单,只有一点需要注意:像"制片国家"这样位于标签之外的文本节点需要特别处理。其中制片国家、语言、又名这三项一定是标签外的文本节点;片长则只有在电影有多个片长版本时才会出现标签外的文本节点(只有一个片长时,片长文本在标签内部;有多个片长时,多出来的片长会出现在标签外部)。