Java爬虫
导入依赖
Apache-HttpClient
SLF4J LOG4J12 Binding(slf4j-log4j12,SLF4J 到 Log4j 1.2 的绑定)
入门程序
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// 2. Create the GET request object
HttpGet httpGet = new HttpGet("http://www.baidu.com");
// 3. Execute the GET request with the client
CloseableHttpResponse response = httpClient.execute(httpGet);
try {
    // 4. Read the response body only on HTTP 200
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original snippet was missing the closing brace of the if-block
    // and never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
带参数的get请求
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// Build the request URI and append a query parameter (?keys=Java)
URIBuilder uriBuilder = new URIBuilder("http://www.itheima.com/");
uriBuilder.setParameter("keys", "Java");
// 2. Create the GET request object from the built URI
HttpGet httpGet = new HttpGet(uriBuilder.build());
// 3. Execute the GET request with the client
CloseableHttpResponse response = httpClient.execute(httpGet);
try {
    // 4. Read the response body only on HTTP 200
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
Post请求
将httpGet改成httpPost
带参数的Post请求
// 1. Create the HttpClient instance
CloseableHttpClient httpClient = HttpClients.createDefault();
// 2. Create the POST request object
HttpPost httpPost = new HttpPost("http://www.baidu.com");
// 3. Build the form parameters (keys=Java)
List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("keys", "Java"));
// 4. Wrap the parameters in a URL-encoded form entity
//    (charset normalized to "utf-8" to match the decoding side below;
//    "utf8" is an accepted alias but inconsistent).
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
// 5. Attach the form entity to the POST request
httpPost.setEntity(formEntity);
// 6. Execute the request and read the response
CloseableHttpResponse response = httpClient.execute(httpPost);
try {
    if (response.getStatusLine().getStatusCode() == 200) {
        HttpEntity httpEntity = response.getEntity();
        String content = EntityUtils.toString(httpEntity, "utf-8");
        System.out.println(content);
    }
} finally {
    // Fix: the original never released the response or the client (resource leak).
    response.close();
    httpClient.close();
}
连接池管理器
public static void main(String[] args) throws IOException, URISyntaxException {
    // Create a pooled connection manager shared by every request below.
    PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
    // Cap the total number of pooled connections.
    connectionManager.setMaxTotal(100);
    // Cap the connections per route, i.e. the maximum per target host.
    connectionManager.setDefaultMaxPerRoute(10);
    // Issue two requests that draw connections from the same pool.
    doGet(connectionManager);
    doGet(connectionManager);
}
/**
 * Performs one GET request against http://www.itheima.com/ using a client
 * backed by the shared connection pool, printing the body on HTTP 200.
 *
 * Note: the client built here must NOT be closed — closing it would shut
 * down the shared pool passed in by the caller.
 */
private static void doGet(PoolingHttpClientConnectionManager cm) throws IOException {
    // 1. Create an HttpClient that borrows connections from the pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // 2. Create the GET request object
    HttpGet httpGet = new HttpGet("http://www.itheima.com/");
    // 3. Execute the GET request
    CloseableHttpResponse response = httpClient.execute(httpGet);
    try {
        // 4. Read the response body only on HTTP 200
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = response.getEntity();
            String content = EntityUtils.toString(httpEntity, "utf-8");
            System.out.println(content);
        }
    } finally {
        // Fix: closing the response returns its connection to the pool.
        // The original never closed it, leaking one pooled connection per call.
        response.close();
    }
}
请求配置
HttpGet httpGet = new HttpGet(uriBuilder.build());
// Fix: the original chained setConnectionRequestTimeout twice; per its own
// comment the first call was meant to be setConnectTimeout (the max time to
// establish the TCP connection).
RequestConfig config = RequestConfig.custom()
        .setConnectTimeout(1000)          // max time to establish the connection (ms)
        .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool (ms)
        .setSocketTimeout(10 * 1000)      // max time between data packets (ms)
        .build();
httpGet.setConfig(config); // attach the configuration to this request
Jsoup
前期准备
导入依赖jsoup org.jsoup
commons io //对文件操作
junit
commons lang3
解析html
// Download and parse the page at the given URL; the second argument is the
// request timeout in milliseconds.
Document doc = Jsoup.parse(new URL("http://www.itheima.com"), 1000);
// Locate the first <title> element via the tag selector and print its text.
Element titleElement = doc.getElementsByTag("title").first();
String title = titleElement.text();
System.out.println(title);
解析文件(解析 HTML 字符串可直接用 Jsoup.parse(String html))
// Parse an HTML file from disk with the given charset.
Document doc = Jsoup.parse(new File("文件地址"), "utf-8");
// Fix: the Jsoup API is getElementsByTag (plural) and first() is a method call;
// the original wrote getElementByTag(...).first, which does not compile.
String title = doc.getElementsByTag("title").first().text();
使用 DOM 方式遍历文档
// Fix: except for getElementById, Jsoup's lookup methods are plural
// ("getElementsBy..."); the original used non-existent singular names.
doc.getElementById("id");                                  // single Element (or null)
doc.getElementsByTag("title");                             // Elements
doc.getElementsByClass("class_a class_b").first();         // first Element — NOTE(review): this takes ONE class name; a space-separated pair likely matches nothing — verify intent
doc.getElementsByAttribute("abc");                         // elements having attribute "abc" (semicolon was missing)
doc.getElementsByAttributeValue("href", "www.baidu.com");  // elements whose href equals the value
获取元素中的数据
element 是一个节点元素
String str = element.id();   // the element's id attribute, as a String
element.className();         // the "class" attribute value, as a String
element.classNames();        // all class names, as a Set<String>
element.attr("class");       // value of the named attribute (fix: semicolon was missing)
element.attributes();        // all attributes, as an Attributes collection
element.text();              // the element's combined text content, as a String
使用选择器查询元素
doc.select("span"); // by tag name; returns Elements
doc.select("#id"); // by id; returns Elements
doc.select("[abc]"); // elements that have attribute "abc"
doc.select("[class=className]"); // elements whose attribute equals the given value
任意组合
doc.select("span[abc]"); // tag combined with an attribute selector
h3#id //元素+id
h3.className //元素+类名
WebMagic
导入依赖
WebMagic core
WebMagic Extension
入门程序
public class Demo01 implements PageProcessor {
@Override
public void process(Page page) {
//解析页面
//Css
//也有xpath,也有正则表达式
page.putField("div",page.getHtml().css("title").regex(".*s")all());
}
private Site site = Site.me();
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new Demo01())
.addUrl("https://www.itheima.com/") //设置爬取页面
.run();
}
}
page.getHtml().css("title").regex(".*s").all() // all matching results, as a List
page.getHtml().css("title").regex(".*s").get() // only the first result
page.getHtml().css("title").regex(".*s").toString() // same as get()
获取链接
// Queue every link found on the page as a new crawl target.
// Fix: Selectable.css() requires a selector argument; to take all links on
// the whole page, call links() directly.
page.addTargetRequests(page.getHtml().links().all());
// Fix: the method is putField (the original had the typo "putFiled");
// it stores an extraction result under the given key.
page.putField("key", "value");
多线程
Spider.create(new Demo01())
.addUrl("https://www.biqugeg.com/20_20368/15354666.html") // seed URL to crawl
.addPipeline(new FilePipeline("C:\\Users\\WX\\Desktop\\新建文件夹")) // persist results to files in this directory
.thread(5) // run the spider with 5 worker threads
.run();
爬虫的配置、启动和终止
// Crawler configuration: charset, timeout, retry interval and retry count.
private Site site = Site.me()
        .setCharset("utf8")      // page charset used for decoding
        .setTimeOut(10000)       // download timeout, in ms
        .setRetrySleepTime(1000) // pause between retries, in ms
        // Fix: the comment said "retry count" but the original called
        // setSleepTime(3) (sleep between pages); the retry count is
        // configured via setRetryTimes.
        .setRetryTimes(3)
        ;

@Override
public Site getSite() {
    return site;
}
定时任务
可使用定时调度周期性启动爬虫,例如 Quartz 的 Cron 表达式或 Spring 的 @Scheduled 注解。