自学Python 来写一个爬虫吧 ----> JAVA实现
1.HttpOpener.java 模拟浏览器行为,获取页面信息。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import com.alibaba.fastjson.JSONObject;
/**
 * Singleton HTTP helper that imitates a desktop browser and keeps cookies between runs.
 *
 * <p>Every request goes through one shared {@link HttpClient} and one shared cookie store.
 * The store is loaded from the file named by {@code cookiefile} when the singleton is
 * created and is written back to that file after each request.
 *
 * <p>The instance is handed to many worker threads, so each request runs in its own
 * {@link HttpClientContext}; only the cookie store is shared between them.
 */
public class HttpOpener {
    /** Charset used for the cookie file and for JSON request bodies. */
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static HttpOpener opener = null;
    private static HttpClient httpClient = null;
    private static HttpClientContext httpClientContext = null;
    /** Cookie jar shared by the client and by every request context. */
    private static CookieStore cookieStore = null;
    private static String cookiefile = "cookie";

    /**
     * Returns a context bound to the shared cookie store.
     *
     * <p>Requests issued by this class run in per-request contexts, so only the cookie
     * store of the returned context reflects them.
     *
     * @return the long-lived context created together with the singleton
     */
    public HttpClientContext getHttpClientContext() {
        return httpClientContext;
    }

    /** Private constructor: the class is a singleton so that cookies are shared. */
    private HttpOpener() {
        // Browser-like default headers sent with every request.
        List<Header> headers = new ArrayList<Header>();
        headers.add(new BasicHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"));
        headers.add(new BasicHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36"));
        headers.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate, sdch"));
        headers.add(new BasicHeader(HttpHeaders.CACHE_CONTROL, "max-age=0"));
        headers.add(new BasicHeader(HttpHeaders.CONNECTION, "keep-alive"));
        headers.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.8"));

        // Load the cookies once, before the client and the context are built, so both
        // share the same store.
        CookieStore loaded;
        try {
            loaded = readCookie();
        } catch (Exception e) {
            // An unreadable cookie file must not prevent start-up; begin with an empty jar.
            e.printStackTrace();
            loaded = new BasicCookieStore();
        }
        cookieStore = loaded;

        httpClient = HttpClientBuilder.create()
                .setDefaultCookieStore(cookieStore)
                .setDefaultHeaders(headers)
                .build();
        httpClientContext = newContext();
    }

    /** Creates a fresh request context that uses the shared cookie store. */
    private static HttpClientContext newContext() {
        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);
        return context;
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the singleton opener
     */
    public static synchronized HttpOpener getOpener() {
        if (opener == null) {
            opener = new HttpOpener();
        }
        return opener;
    }

    /**
     * Issues a GET request and persists the cookie store afterwards.
     *
     * @param url  target URL
     * @param data query parameters appended as {@code ?a=b&c=d}; may be {@code null}
     * @return the raw response; the caller is responsible for consuming its entity
     * @throws URISyntaxException      if {@code url} is not a valid URI
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem or if the cookie file cannot be written
     */
    public HttpResponse doGet(String url,Map<String,String> data) throws URISyntaxException, ClientProtocolException, IOException {
        List<NameValuePair> dataList = new ArrayList<NameValuePair>();
        if (data != null) {
            for (Entry<String, String> entry : data.entrySet()) {
                dataList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        URI uri = new URIBuilder(url).addParameters(dataList).build();
        HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();
        HttpResponse httpResponse = httpClient.execute(httpUriRequest, newContext());
        saveCookie();
        return httpResponse;
    }

    /**
     * Issues a POST request and persists the cookie store afterwards.
     *
     * <p>For {@code http:} URLs {@code data} is sent as a JSON body; for {@code https:}
     * URLs it is sent as a URL-encoded form.
     *
     * @param url  target URL, starting with {@code http:} or {@code https:}
     * @param data values to send; may be {@code null}
     * @return the raw response; the caller is responsible for consuming its entity
     * @throws Exception if the URL has another scheme, the request fails, or the cookie
     *                   file cannot be written
     */
    public HttpResponse doPost(String url,Map<String,String> data) throws Exception{
        HttpPost httpPost = new HttpPost(url);
        // No explicit content-type header here: the entity set below carries the type
        // that matches the body actually sent.
        httpPost.setHeader("Connection", "Close");
        if (url.startsWith("http:")) {
            String jsonObj = JSONObject.toJSONString(data);
            StringEntity entity = new StringEntity(jsonObj, UTF8);
            entity.setContentType("application/json");
            httpPost.setEntity(entity);
        } else if (url.startsWith("https:")) {
            List<NameValuePair> datalist = new ArrayList<NameValuePair>();
            if (data != null) {
                for (Entry<String, String> entry : data.entrySet()) {
                    datalist.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
            }
            httpPost.setEntity(new UrlEncodedFormEntity(datalist, "UTF-8"));
        } else {
            throw new IllegalArgumentException("Error Url,it should been started with 'http' or 'https'.");
        }
        HttpResponse httpResponse = httpClient.execute(httpPost, newContext());
        saveCookie();
        return httpResponse;
    }

    /**
     * Writes every cookie of the shared store to the cookie file, one per line, using
     * {@code Cookie.toString()} as the line format.
     *
     * <p>Synchronized because worker threads finish requests concurrently and would
     * otherwise write the same file at the same time.
     *
     * @throws IOException if the file cannot be written
     */
    private synchronized void saveCookie() throws IOException{
        StringBuilder sb = new StringBuilder();
        for (Cookie cookie : cookieStore.getCookies()) {
            sb.append(cookie.toString());
            sb.append("\n");
        }
        byte[] bytes = sb.toString().getBytes(UTF8);
        try (OutputStream writer = new FileOutputStream(new File(cookiefile))) {
            writer.write(bytes);
        }
    }

    /**
     * Loads the cookies written by {@link #saveCookie()}.
     *
     * <p>Reading stops at the first empty line; lines that cannot be parsed are skipped.
     *
     * @return a store with the cookies found; empty (never {@code null}) when the file
     *         does not exist
     * @throws IOException if the file exists but cannot be read
     */
    private CookieStore readCookie() throws IOException{
        CookieStore store = new BasicCookieStore();
        File file = new File(cookiefile);
        if (!file.isFile()) {
            // First run: nothing has been saved yet.
            return store;
        }
        long expiryMillis = new Date().getTime() + Integer.MAX_VALUE;
        try (BufferedReader reader = new BufferedReader(
                new java.io.InputStreamReader(new java.io.FileInputStream(file), UTF8))) {
            String text;
            while ((text = reader.readLine()) != null && !text.isEmpty()) {
                BasicClientCookie cookie = parseCookie(text, expiryMillis);
                if (cookie != null) {
                    store.addCookie(cookie);
                }
            }
        }
        return store;
    }

    /**
     * Parses one line of the form {@code [key: value][key: value]...}.
     *
     * <p>NOTE(review): values containing '[' or ']' do not survive this format.
     *
     * @param line         one line of the cookie file
     * @param expiryMillis expiry timestamp given to cookies that carry an expiry entry
     * @return the cookie, or {@code null} if the line has no name or no numeric version
     */
    private static BasicClientCookie parseCookie(String line, long expiryMillis) {
        Map<String,String> ckmap = new HashMap<String,String>();
        for (String str : line.replace("[", "").split("]")) {
            String[] pair = str.split(": ", 2);
            if (str.length() > 5 && pair.length == 2) {
                ckmap.put(pair[0], pair[1]);
            }
        }
        String name = ckmap.get("name");
        if (name == null) {
            return null;
        }
        int version;
        try {
            version = Integer.parseInt(ckmap.get("version"));
        } catch (NumberFormatException e) {
            return null;
        }
        BasicClientCookie cookie = new BasicClientCookie(name, ckmap.get("value"));
        cookie.setDomain(ckmap.get("domain"));
        cookie.setPath(ckmap.get("path"));
        cookie.setVersion(version);
        // NOTE(review): the key looked up here is "expiryDate"; confirm that it matches
        // the key Cookie.toString() actually writes, otherwise no expiry is ever restored.
        cookie.setExpiryDate((ckmap.get("expiryDate") == null) ? null : new Date(expiryMillis));
        return cookie;
    }

    /**
     * Builds an {@link SSLContext} whose trust manager accepts every certificate.
     *
     * <p>WARNING: this disables HTTPS certificate verification. It is not wired into the
     * shared client; nothing in this class calls it.
     *
     * @return a TLS context that performs no certificate checks
     * @throws NoSuchAlgorithmException if no TLS implementation is available
     * @throws KeyManagementException   if the context cannot be initialised
     */
    private static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
        SSLContext sc = SSLContext.getInstance("TLS");
        X509TrustManager trustManager = new X509TrustManager() {
            @Override
            public void checkClientTrusted(
                    java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
                    String paramString) throws CertificateException {
            }
            @Override
            public void checkServerTrusted(
                    java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
                    String paramString) throws CertificateException {
            }
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                // The interface contract asks for a non-null (possibly empty) array.
                return new java.security.cert.X509Certificate[0];
            }
        };
        sc.init(null, new TrustManager[] { trustManager }, null);
        return sc;
    }
}
这里需要的就是doGet方法,获取页面。构造方法可以再添加代理……(日后再说)
接下来就是对获取到的页面内容进行处理,得到需要的部分,这里我创建了一个工具类来完成部分操作:
package Utils;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.http.conn.HttpHostConnectException;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
/**
 * Static helpers for the crawler: saving bytes to files, reading streams into strings,
 * converting XML into nested maps, normalising URLs found on a page, and fetching a page
 * and extracting values from it with CSS selectors.
 */
public class Utils {
    // Characters expected in correctly decoded content and absent when the wrong charset
    // was used (for a Chinese site: common Chinese characters, full-width punctuation).
    // More entries make the check more accurate; likely ones first make it faster.
    private static final String[] strs= {"章","了","。","一","二","三","四","五","六","七","八","九","十","百","千","我"};

    /**
     * Writes a byte array to a file.
     *
     * @param bs   bytes to write
     * @param path destination path
     * @return {@code path}
     * @throws IOException if the file cannot be written
     */
    public static String WriteInFile(byte[] bs,String path) throws IOException {
        try (FileOutputStream writer = new FileOutputStream(new File(path))) {
            writer.write(bs);
        }
        return path;
    }

    /**
     * Copies an input stream to a file and closes the stream.
     *
     * @param is   stream to read until end of input
     * @param path destination path
     * @return {@code path}
     * @throws IOException if reading or writing fails
     */
    public static String WriteInFile(InputStream is,String path) throws IOException {
        try (InputStream in = is;
             FileOutputStream writer = new FileOutputStream(new File(path))) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                writer.write(buffer, 0, len);
            }
        }
        return path;
    }

    /**
     * Reads a stream into a string using the platform default charset and closes it.
     * Line terminators are dropped.
     *
     * @param in stream to read
     * @return the concatenated lines
     * @throws IOException if reading fails
     */
    public static String Stream2String(InputStream in) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        }
    }

    /**
     * Reads a stream into a string using the given charset and closes it. Line
     * terminators are dropped, and so is every line starting with {@code <!}.
     *
     * @param in   stream to read
     * @param code charset name
     * @return the concatenated lines
     * @throws IOException if reading fails or the charset is not supported
     */
    public static String Stream2String(InputStream in,String code) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, code))) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("<!")) {
                    // Skip lines such as <!DOCTYPE html>, which the author found to
                    // interfere with Jsoup.
                    continue;
                }
                sb.append(line);
            }
            return sb.toString();
        }
    }

    /**
     * Converts an XML document into nested maps.
     *
     * <p>NOTE(review): the text is parsed with the library's default parser settings; if
     * the XML can come from an untrusted source, confirm that external entities are
     * disabled.
     *
     * @param str XML text
     * @return a map holding the root element, or {@code null} if the text cannot be parsed
     */
    public static Map<String,Object> XML2Map(String str){
        Document doc = null;
        try {
            doc = DocumentHelper.parseText(str);
        } catch (DocumentException e) {
            e.printStackTrace();
        }
        if (doc == null) {
            return null;
        }
        Map<String, Object> map = new HashMap<String, Object>();
        element2map(doc.getRootElement(), map);
        return map;
    }

    /**
     * Recursively stores {@code outele} in {@code outmap} under its element name.
     *
     * <p>An element without children is stored as its trimmed text. An element with
     * children is stored as a map of its children; children that share a name are
     * collected, in document order, into a list under that name.
     *
     * @param outele element to convert
     * @param outmap map that receives the entry for {@code outele}
     * @return {@code outmap}
     */
    @SuppressWarnings("unchecked")
    private static Map<String, Object> element2map(Element outele, Map<String, Object> outmap) {
        List<Element> children = outele.elements();
        if (children.isEmpty()) {
            outmap.put(outele.getName(), outele.getTextTrim());
            return outmap;
        }
        Map<String, Object> innermap = new HashMap<String, Object>();
        for (Element child : children) {
            String eleName = child.getName();
            // Convert the child into a scratch map so that an entry already stored under
            // the same name is not overwritten.
            Map<String, Object> single = new HashMap<String, Object>();
            element2map(child, single);
            Object converted = single.get(eleName);
            Object existing = innermap.get(eleName);
            if (existing == null) {
                innermap.put(eleName, converted);
            } else if (existing instanceof List) {
                ((List<Object>) existing).add(converted);
            } else {
                // Second occurrence of this name: promote the single value to a list.
                List<Object> siblings = new ArrayList<Object>();
                siblings.add(existing);
                siblings.add(converted);
                innermap.put(eleName, siblings);
            }
        }
        outmap.put(outele.getName(), innermap);
        return outmap;
    }

    /**
     * Turns a link found on a page into a complete URL.
     *
     * <ul>
     * <li>{@code http://...} or {@code https://...}: returned unchanged;</li>
     * <li>no slash at all, or leading {@code ../} segments: resolved against the directory
     *     of {@code Lasturl}, going up one level per {@code ../};</li>
     * <li>leading {@code /}: appended to {@code Baseurl} without the leading slash.</li>
     * </ul>
     *
     * @param Baseurl site root; the code appends the path directly, so it is expected to
     *                end with {@code /}
     * @param Lasturl URL of the page on which {@code url} was found
     * @param url     link to resolve
     * @return the complete URL
     * @throws Exception if the link has another form or points above the top of {@code Lasturl}
     */
    public static String Url2Url(String Baseurl,String Lasturl,String url) throws Exception{
        if (url.startsWith("http://") || url.startsWith("https://")) {
            return url;
        } else if (url.indexOf("/") < 0 || url.startsWith("../")) {
            String rest = url;
            int levels = 1; // one level removes the file name of the current page
            while (rest.startsWith("../")) {
                rest = rest.substring(3);
                levels++;
            }
            String dir = Lasturl;
            for (int i = 0; i < levels; i++) {
                int cut = dir.lastIndexOf("/");
                if (cut < 0) {
                    throw new Exception("Error Url");
                }
                dir = dir.substring(0, cut);
            }
            return dir + "/" + rest;
        } else if (url.indexOf("/") == 0) {
            return Baseurl + url.substring(1);
        } else {
            throw new Exception("Error Url");
        }
    }

    /**
     * Batch form of {@link #Url2Url(String, String, String)}: rewrites the list in place.
     * Entries that cannot be resolved are left unchanged.
     *
     * @param Baseurl site root
     * @param Lasturl URL of the page on which the links were found
     * @param urls    links to resolve; modified in place
     * @return {@code urls}
     */
    public static List<String> Url2Url(String Baseurl,String Lasturl,List<String> urls){
        for (int i = 0; i < urls.size(); i++) {
            try {
                urls.set(i, Url2Url(Baseurl, Lasturl, urls.get(i)));
            } catch (Exception ignored) {
                // Best effort: an unresolvable link keeps its original text.
            }
        }
        return urls;
    }

    /**
     * Downloads {@code url} and collects one value from every element matching {@code css}.
     *
     * <p>The page is decoded with {@code code} first (when given) and then with a list of
     * common charsets; the first charset that produces at least one accepted value wins.
     * A refused connection is retried after a random pause of up to 20 seconds.
     *
     * @param opener HTTP helper used for the download
     * @param url    page to download
     * @param code   preferred charset name, or {@code null}
     * @param css    selector for the elements of interest
     * @param child  selector applied inside each match (first hit is used), or
     *               {@code null}/empty to use the match itself
     * @param value  attribute to read, or {@code null} to read the element text
     * @param notest {@code true} to accept every value; {@code false} to accept only values
     *               containing one of the characters in {@code strs}
     * @return the accepted values, possibly empty; two empty strings if the download or the
     *         parsing failed
     */
    public static List<String> Url2Get(HttpOpener opener,String url,String code,String css,String child,String value,boolean notest) {
        return extractOrFailure(fetchOrNull(opener, url), url, code, css, child, value, notest);
    }

    /**
     * Downloads {@code url} once and runs several extractions on it; index {@code i} of
     * the arrays describes extraction {@code i} as in the single-selector overload.
     *
     * @return one result list per selector, or {@code null} if the arrays differ in length
     */
    public static List<List<String>> Url2Get(HttpOpener opener, String url,String code, String[] css, String[] child, String[] value,boolean[] notest) {
        if (css.length != value.length || css.length != child.length || css.length != notest.length) {
            return null;
        }
        byte[] page = fetchOrNull(opener, url);
        List<List<String>> list = new ArrayList<>();
        for (int i = 0; i < css.length; i++) {
            list.add(extractOrFailure(page, url, code, css[i], child[i], value[i], notest[i]));
        }
        return list;
    }

    /** Placeholder result for a failed page: two empty strings, so that callers reading the first entries do not fail. */
    private static List<String> failure() {
        List<String> list = new ArrayList<>();
        list.add("");
        list.add("");
        return list;
    }

    /** Downloads the page body, or returns {@code null} (after printing the error) on failure. */
    private static byte[] fetchOrNull(HttpOpener opener, String url) {
        try {
            return fetch(opener, url);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Downloads the page body, retrying refused connections until interrupted. */
    private static byte[] fetch(HttpOpener opener, String url) throws Exception {
        while (true) {
            try (InputStream html = opener.doGet(url, null).getEntity().getContent()) {
                // Buffer the whole body: it has to be parsed once per candidate charset
                // and a stream can only be read once.
                ByteArrayOutputStream copy = new ByteArrayOutputStream();
                byte[] buffer = new byte[1024];
                int len;
                while ((len = html.read(buffer)) > -1) {
                    copy.write(buffer, 0, len);
                }
                return copy.toByteArray();
            } catch (HttpHostConnectException e) {
                try {
                    Thread.sleep((int)(20000*Math.random()));
                } catch (InterruptedException e1) {
                    // Preserve the interrupt and give up instead of retrying forever.
                    Thread.currentThread().interrupt();
                    throw e;
                }
            }
        }
    }

    /** Runs {@link #extract} and maps a missing page or any error to {@link #failure()}. */
    private static List<String> extractOrFailure(byte[] page, String url, String code, String css, String child, String value, boolean notest) {
        if (page == null) {
            return failure();
        }
        try {
            return extract(page, url, code, css, child, value, notest);
        } catch (Exception e) {
            e.printStackTrace();
            return failure();
        }
    }

    /** Parses the page with each candidate charset until one yields accepted values. */
    private static List<String> extract(byte[] page, String url, String code, String css, String child, String value, boolean notest) throws IOException {
        // Common charsets first for speed; narrow ones first to reduce wrong decodings.
        String[] basecodes = {"UTF-8","GBK","ASCII","ISO-8859-1","GB2312","Unicode"};
        String[] codes;
        if (code == null) {
            codes = basecodes;
        } else {
            codes = new String[basecodes.length+1];
            System.arraycopy(basecodes, 0, codes, 1, basecodes.length);
            codes[0] = code;
        }
        int slash = url.lastIndexOf('/');
        String baseUri = slash > 0 ? url.substring(0, slash) : "";
        List<String> list = new ArrayList<>();
        for (String icode : codes) {
            // Decoding with the wrong charset does not raise an error in Java, so the
            // result has to be validated (testlist) instead of relying on an exception.
            org.jsoup.nodes.Document doc = Jsoup.parse(new ByteArrayInputStream(page), icode, baseUri);
            Elements es = doc.select(css);
            for (org.jsoup.nodes.Element e : es) {
                org.jsoup.nodes.Element one = (child != null && !child.equals("")) ? e.selectFirst(child) : e;
                if (one == null) {
                    continue;
                }
                String found = (value == null) ? one.text() : one.attr(value);
                if (notest || testlist(found)) {
                    list.add(found);
                }
            }
            if (!list.isEmpty()) {
                break; // this charset produced usable content
            }
        }
        return list;
    }

    /**
     * Checks whether text looks correctly decoded.
     *
     * @param string text to check
     * @return {@code true} if it contains at least one of the characters in {@code strs}
     */
    private static boolean testlist(String string){
        for (String str : strs) {
            if (string.contains(str)) {
                return true;
            }
        }
        return false;
    }
}
第三步,编写多线程部分:
使用多个线程连接目标站点获取页面;内容存储到SQLite数据库时,通过lock保证同一时间只有一个线程写入。
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;
import xywi.util.HttpOpener;
import xywi.util.Utils;
/**
 * One download task: fetches a chapter page, extracts title and text, and inserts them
 * into the table {@code txt<bookno>}.
 *
 * <p>Pages are downloaded concurrently; the database insert is serialised by a lock that
 * is shared by all tasks.
 */
public class ThreadWork extends Thread {
    /** Shared by every task; a per-instance lock would not exclude anything. */
    private static final ReentrantLock lock = new ReentrantLock();
    private Connection con = null;
    private HttpOpener opener = null;
    private String url = null;
    private String code = null;
    private String[] css = null;
    private String[] value = null;
    private String bookno = null;
    private String[] child = null;
    private boolean[] notest = null;

    /**
     * @param con    open database connection shared by all tasks
     * @param opener HTTP helper used for the download
     * @param url    chapter URL; the part between the last {@code /} and the last
     *               {@code .} must be an integer, which is stored as the chapter number
     * @param code   preferred charset name, or {@code null}
     * @param css    selectors; index 0 selects the title, index 1 the text
     * @param child  per-selector child selectors (entries may be {@code null})
     * @param value  per-selector attribute names (entries may be {@code null} for text)
     * @param notest per-selector flags that switch off the decoding check
     * @param bookno book number, used as the suffix of the table name
     */
    public ThreadWork(Connection con, HttpOpener opener, String url,
            String code, String[] css, String[]child, String[] value, boolean[] notest,String bookno) {
        super();
        this.con = con;
        this.opener = opener;
        this.url = url;
        this.code = code;
        this.css = css;
        this.child = child;
        this.value = value;
        this.notest = notest;
        this.bookno = bookno;
    }

    /** Downloads the chapter and stores it; runs on a pool thread. */
    @Override
    public void run(){
        int urlnum;
        try {
            urlnum = Integer.parseInt(url.substring(url.lastIndexOf("/")+1, url.lastIndexOf(".")));
        } catch (RuntimeException e) {
            System.err.println("Skipping " + url + ": no chapter number in the URL.");
            return;
        }
        System.out.println("Downloading "+url+" ...");
        List<List<String>> list = Utils.Url2Get(opener, url, code, css, child, value, notest);
        if (list == null || list.size() < 2 || list.get(0).isEmpty() || list.get(1).isEmpty()) {
            System.err.println("Skipping " + url + ": title or text not found.");
            return;
        }
        System.out.println("One Get!");
        // The values are bound as statement parameters, so quotes need no escaping.
        String title = list.get(0).get(0);
        String text = list.get(1).get(0).replace("比如广告之类的东西", "");
        // Only the table name is concatenated (it cannot be a parameter).
        String sql = "insert into txt"+bookno+" values(?,?,?);";
        lock.lock(); // from here on only one task at a time; the others wait
        try (java.sql.PreparedStatement insert = this.con.prepareStatement(sql)) {
            insert.setInt(1, urlnum);
            insert.setString(2, title);
            insert.setString(3, text);
            insert.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            lock.unlock();
        }
    }
}
对于单核CPU运行计算密集型多线程,性能会下降,但对于IO密集型多线程(本例),性能会上升(限制你的是网速和对方网站的响应速度,当然你要是开9999个线程当我没说……)。
最后就是程序的入口和逻辑了
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.sqlite.SQLiteException;
import xywi.util.HttpOpener;
import xywi.util.Utils;
/**
 * Entry point: downloads every chapter of one book into a SQLite table and then writes
 * the chapters, ordered by chapter number, to {@code <bookno>.txt}.
 */
public class Main {
    static String jdbc = "org.sqlite.JDBC";//sqlite jdbc driver class
    static String url = "jdbc:sqlite:local.db";//sqlite database file

    /**
     * @param args {@code args[0]} is the book number; it becomes part of a table name and
     *             of a file name, so only letters, digits and {@code _} are accepted
     * @throws Exception on any download, database or file error
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1 || !args[0].matches("[A-Za-z0-9_]+")) {
            throw new IllegalArgumentException("Usage: java -jar FileName.jar bookNo (letters, digits and '_' only)");
        }
        String bookno = args[0];
        HttpOpener opener = HttpOpener.getOpener();
        final String Baseurl = "";//site root (home page)
        String indexurl = Baseurl +"Book/"+bookno+"/Index.aspx";//introduction page of the book
        // Link to the chapter-list page: opener, page, charset, element, child, attribute, no decoding check.
        List<String> list = Utils.Url2Get(opener, indexurl, "GBK", "div#CrbsButton", "a", "href",true);
        if (list.isEmpty()) {
            throw new IllegalStateException("No chapter-list link found on " + indexurl);
        }
        String listurl = Utils.Url2Url(Baseurl, indexurl, list.get(0));
        // One URL per chapter.
        list = Utils.Url2Url(Baseurl,listurl,Utils.Url2Get(opener, listurl, "GBK", "div#BookText ul li", "a", "href",true));
        Class.forName(jdbc);
        try (Connection con = DriverManager.getConnection(url);
             Statement statement = con.createStatement()) {
            // Download only if the table is missing or holds fewer rows than chapters found.
            boolean restart = false;
            try (ResultSet count = statement.executeQuery("select count(1) as sum from txt"+bookno+" ;")) {
                count.next();
                restart = count.getInt("sum") < list.size();
            } catch (java.sql.SQLException e) {
                // The table does not exist yet.
                restart = true;
            }
            if (restart) {
                statement.executeUpdate("drop table if exists txt"+bookno+";");
                statement.executeUpdate("create table txt"+bookno+"(url Integer not null,title text not null,txt text not null);");
                ExecutorService server = Executors.newFixedThreadPool(64);
                try {
                    for (String chapterUrl : list) {
                        server.execute(new ThreadWork(con, opener, chapterUrl, null, new String[]{"div#TextTitle","div#BookTextt"},new String[]{"span",null},new String[]{null,null},new boolean[]{false,false},bookno));
                    }
                } finally {
                    server.shutdown();//no further submissions
                }
                // Block until every task has finished instead of polling in a busy loop.
                server.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
                System.out.println("All Download Over!");
            }
            // NOTE(review): the file is opened in append mode and text is encoded with the
            // platform default charset; running twice appends the book twice - confirm intended.
            try (ResultSet rs = statement.executeQuery("select title,txt,url from txt"+bookno+" order by url ASC;");
                 BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(new File(bookno+".txt"),true))) {
                while (rs.next()) {
                    String title = rs.getString("title") + "\n";
                    bos.write(title.getBytes());
                    String text = rs.getString("txt") + "\n\n";
                    bos.write(text.getBytes());
                }
            }
        }
        System.out.println("All Over!");
    }
}
这里使用了 java.util.concurrent 下的线程池来执行多线程任务。线程池限定了同时运行的线程数量(本例为固定的 64 个工作线程),这些工作线程会被重复利用,依次执行提交进来的任务。
在python版本里,我是将章节连接分给了n个线程,然后所有线程互不干扰运行(操作数据库那儿除外),然后等到全部结束后,主线程继续;
在java版本里,程序为每一章节创建了一个任务(ThreadWork 对象)并提交给线程池,但这些任务受线程池的限制,同一时间只有线程池大小那么多个任务在运行(数据库操作除外),其余的任务都在队列中等待,每当有一个任务运行结束,空闲下来的工作线程就会从队列中取出下一个任务开始运行。而在这个过程中,主线程一直在等待所有任务结束。
将项目打包Jar 后 java -jar FileName.jar bookNo 即可。(路人:别人写了半年,你十分钟down下来了,好么?! 我:表打我……)