Writing a Web Crawler in a Maven Project
Create New Project -> Maven -> Create from archetype -> maven-archetype-quickstart -> create the project -> click Enable Auto-Import
1. In pom.xml, change the 1.7 in the <maven.compiler.source> and <maven.compiler.target> properties to 1.8.
2. Add the HttpClient dependency under <dependencies> in pom.xml:

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
3. Open Project Structure, select Modules on the left, set the language level under the Sources tab, and click Apply.
4. File -> Settings -> Compiler -> Java Compiler, select 8 as the target bytecode version, and click Apply.
5. Delete the generated test directory, then create your package and classes under main.
The code is as follows:
**//Spider:**
package org.example.core;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Spider {

    private static CloseableHttpClient httpClient = HttpClients.createDefault();
    private static final int M = 1024 * 1024; // 1 MB buffer size

    // Close any number of resources, swallowing individual failures.
    private static void close(AutoCloseable... closes) {
        for (AutoCloseable close : closes) {
            if (null != close) {
                try {
                    close.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    // 1. Fetch the source of the list page.
    private static String getHtml(String url) {
        String html = null;
        HttpGet httpGet = new HttpGet(url);
        try {
            CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
            HttpEntity entity = httpResponse.getEntity();
            html = EntityUtils.toString(entity);
            // Strip newlines (and following tabs) so the single-line regexes
            // below can match across what used to be multiple lines.
            html = html.replaceAll("\r?\n\t*", "");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }
    // 2. Parse the source and extract the detail links.

    // Strip path separators from a name so it is safe to use as a file name.
    private static String replacePathSign(String name) {
        return name.replaceAll("/|\\\\", "");
    }

    // Collect every match of regex into a map of group(1) -> sanitized group(2).
    private static Map<String, String> parseHtml(String html, String regex) {
        Map<String, String> map = new HashMap<>();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(html);
        int index = 0;
        while (matcher.find(index)) {
            map.put(matcher.group(1), replacePathSign(matcher.group(2)));
            index = matcher.end();
        }
        return map;
    }
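    // With the regexUrl used in Test below, group(1) is the image URL (the
    // lazysrc value minus its .width.height.jpg thumbnail suffix) and group(2)
    // is the alt text, so the map pairs each image URL with the sanitized
    // file name it will be saved under.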
    // 3. Persist the data: download imgUrl and write it to path/name.jpg.
    private static void disk(String imgUrl, String path, String name) {
        BufferedInputStream bis = null;
        BufferedOutputStream bos = null;
        try {
            bis = new BufferedInputStream(
                    httpClient.execute(new HttpGet(imgUrl))
                            .getEntity().getContent(), M);
            bos = new BufferedOutputStream(
                    new FileOutputStream(path + "/" + name + ".jpg"), M);
            byte[] bs = new byte[M];
            int len = -1;
            while (-1 != (len = bis.read(bs))) {
                bos.write(bs, 0, len);
            }
            bos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(bos, bis);
        }
    }
    // Extract the capture group (the text between the first "(" and ")")
    // from the pagination regex.
    private static String getReplace(String regexPage) {
        return regexPage.substring(
                regexPage.indexOf("(") + 1, regexPage.indexOf(")"));
    }

    // Substitute the concrete page number for the \d+ placeholder.
    private static String regexFill(String replace, int page) {
        String[] split = replace.split("\\\\d\\+\\\\");
        return split[0] + page + split[1];
    }
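    // Worked example with the regexPage used in Test below, ".*?(/\d+\.html)":
    // getReplace returns the capture group "/\d+\.html", and
    // regexFill("/\d+\.html", 2) splits on the literal "\d+\" and returns
    // "/2.html", which the crawl loop swaps into the url via replaceFirst.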
    /**
     * Crawls page by page, submitting one download task per page.
     *
     * @param url        page url
     * @param regexTotal regex extracting the total item count
     * @param pageSize   number of items per page
     * @param regexPage  regex locating the page number inside the url
     * @param regxUrl    regex extracting the data items
     * @param path       output directory
     */
    public static void crawl(String url, String regexTotal, int pageSize,
                             String regexPage, String regxUrl, String path) {
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if (pageMatches) { // the url carries a pagination pattern
            replaceRegx = getReplace(regexPage);
            if (matcherTotal.matches()) { // the page reports a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : " + totalSize);
                total = (int) Math.ceil(Double.parseDouble(totalSize) / pageSize);
            }
        } else {
            System.err.println(regexPage + " matches no pagination pattern; parsing as a single page");
        }
        System.out.println("total page : " + total);
        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            final String HTML = html;
            final int PAGE_NO = page;
            // One task per page: parse the page and download all of its images.
            es.submit(() -> {
                Map<String, String> map = parseHtml(HTML, regxUrl);
                if (!map.isEmpty()) {
                    for (Map.Entry<String, String> e : map.entrySet()) {
                        disk(e.getKey(), path, e.getValue());
                    }
                }
                System.out.println("page " + PAGE_NO + " finished");
            });
            if (!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx, regexFill(replaceRegx, page));
            html = getHtml(url);
        } while (page <= total);
        es.shutdown();
    }
    /**
     * Same crawl loop, but submits one download task per image instead of
     * one per page, so individual downloads run in parallel.
     *
     * @param url        page url
     * @param regexTotal regex extracting the total item count
     * @param pageSize   number of items per page
     * @param regexPage  regex locating the page number inside the url
     * @param regxUrl    regex extracting the data items
     * @param path       output directory
     */
    public static void crawl2(String url, String regexTotal, int pageSize,
                              String regexPage, String regxUrl, String path) {
        String html = getHtml(url);
        System.out.println(html);
        // matcher for the total item count
        Matcher matcherTotal = Pattern.compile(regexTotal).matcher(html);
        // matcher for the pagination pattern
        Matcher matcherPage = Pattern.compile(regexPage).matcher(url);
        int total = 1;
        String replaceRegx = null;
        boolean pageMatches = matcherPage.matches();
        if (pageMatches) { // the url carries a pagination pattern
            replaceRegx = getReplace(regexPage);
            if (matcherTotal.matches()) { // the page reports a total item count
                String totalSize = matcherTotal.group(1);
                System.out.println("total size : " + totalSize);
                total = (int) Math.ceil(Double.parseDouble(totalSize) / pageSize);
            }
        } else {
            System.err.println(regexPage + " matches no pagination pattern; parsing as a single page");
        }
        System.out.println("total page : " + total);
        int page = 1;
        ExecutorService es = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        do {
            // Parse on the calling thread, then submit one task per image.
            Map<String, String> map = parseHtml(html, regxUrl);
            if (!map.isEmpty()) {
                for (Map.Entry<String, String> e : map.entrySet()) {
                    es.submit(() -> disk(e.getKey(), path, e.getValue()));
                }
            }
            System.out.println("page " + page + " finished");
            if (!pageMatches) break;
            page++;
            url = url.replaceFirst(replaceRegx, regexFill(replaceRegx, page));
            html = getHtml(url);
        } while (page <= total);
        es.shutdown();
    }
}
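The two variants differ only in task granularity: crawl hands each page to the thread pool as a whole (parse plus all downloads in one task), while crawl2 parses every page on the calling thread and submits each image download as its own task, so individual downloads run in parallel. The test below uses crawl2.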
**//Test:**
package org.example.core;

public class Test {
    public static void main(String[] args) {
        String url = "https://www.3gbizhi.com/tag/dongman/1.html";
        // captures the total item count from the pager link
        String regexTotal = ".*?<a class=\"a1\">(.*?)条</a>.*?";
        // captures the pagination fragment of the url, e.g. "/1.html"
        String regexPage = ".*?(/\\d+\\.html)";
        //String regexUrl = "<img src=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\">";
        // captures (image url, alt text) pairs from the lazy-loaded <img> tags
        String regexUrl = "<img lazysrc=\"(.*?)\\.\\d+\\.\\d+\\.jpg\".*?alt=\"(.*?)\".*?/>";
        String path = "E:\\123\\spider_path";
        Spider.crawl2(url, regexTotal, 20, regexPage, regexUrl, path);
    }
}
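One caveat the code does not handle: disk() opens the target file with FileOutputStream, which does not create missing directories, so the run fails with FileNotFoundException unless E:\123\spider_path already exists. A minimal guard (my addition, not in the original) at the top of main():

new java.io.File(path).mkdirs(); // create the output directory if absent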