package com.test;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * A small crawler for the Biquge novel site: it collects the chapter links from a book's
 * index page, downloads every chapter page, and appends the chapters to a text file in
 * chapter-number order.
 *
 * <p>Results are accumulated in the public static fields below, so the state is shared by
 * every instance and is NOT reset between calls to {@link #execute(String, String)}.
 *
 * <p>(The original header mentioned "leiqi" — presumably the author.)
 */
public class cralweBook {

    /** URLs already accepted by {@link #filterUrl(String)}, in first-seen order. */
    public static List<String> targetList = new ArrayList<String>();

    /** Not referenced anywhere in this class; kept so existing references still compile. */
    public static List<Map<Integer, Map<String, String>>> endresultlist =
            new ArrayList<Map<Integer, Map<String, String>>>();

    /**
     * Parsed chapters: chapter number -> map holding the keys {@code "title"} and
     * {@code "content"}. Wrapped in a synchronized map because {@link #execute} fills it
     * from several worker threads at once.
     */
    public static Map<Integer, Map<String, String>> endresult =
            Collections.synchronizedMap(new HashMap<Integer, Map<String, String>>());

    /**
     * Legacy retry counter of {@link #parsehtml(String)}. Retry counting is now local to
     * each call (a shared counter is racy across threads); the field is neither read nor
     * written by this class any more and only remains for source compatibility.
     */
    public static int sum = 0;

    /**
     * Legacy timeout-retry counter of {@link #httpurl(String)}. Retry counting is now local
     * to each call; the field is neither read nor written by this class any more and only
     * remains for source compatibility.
     */
    public static int MAX_REQUEST = 0;

    /** Total number of attempts {@link #parsehtml(String)} makes for one chapter page. */
    private static final int PARSE_ATTEMPTS = 4;

    /** Total number of attempts {@link #httpurl(String)} makes when requests time out. */
    private static final int TIMEOUT_ATTEMPTS = 5;

    /** Pause between two timed-out requests, in milliseconds. */
    private static final long TIMEOUT_BACKOFF_MILLIS = 2000L;

    /** Connect and read timeout of a single request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /** Number of threads used by {@link #execute(String, String)} to download chapters. */
    private static final int WORKER_THREADS = 2;

    /** Charset assumed when the response does not name a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Extracts the chapter number from a title such as "第一百二十三章 ..." or "第 12 章".
     * Only numerals are allowed between the two markers so that a later occurrence of the
     * closing character inside the chapter name cannot be captured by mistake.
     */
    private static final Pattern CHAPTER_TITLE =
            Pattern.compile("第\\s*([0-9零〇一二两三四五六七八九十百千万亿]+)\\s*章");

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0";

    // NOTE(review): hard-coded cookie, apparently captured from a browser session. Its entries
    // do not look specific to the crawled site — confirm whether it is needed at all. The
    // value is kept byte-for-byte so the request stays unchanged.
    private static final String COOKIE =
            " __ysuid=1422503216223lJ1; "
            + "advideo={\"adv86850_3\": 1, \"adv87191_5\": 1, \"adv87191_6\": 2, \"adv87472_2\": 1, "
            + "\"adv88186_1\": 2, \"adv88186_4\": 4, \"adv88186_6\": 3, \"adv87469_2\": 2, "
            + "\"adv87470_2\": 1, \"adv87471_2\": 2, \"adv88187_5\": 4, \"adv88186_2\": 2, "
            + "\"adv87968_2\": 2, \"adv87968_3\": 2, \"adv88186_3\": 2, \"adv87472_1\": 2}; "
            + "__tft=1422855207092; __vtft=1422945359306; __hpage_style=0; "
            + "__ali=14254515646480m0; __aliCount=1; "
            + "xreferrer=http://www.baidu.com/s?wd=%E8%B6%85%E4%BA%BA%E5%9B%9E%E6%9D%A5%E4%BA%86"
            + "&rsv_spt=1&issp=1&f=8&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=monline_5_dg&rsv_enter=1"
            + "&rsv_sug3=3&rsv_sug1=1&rsv_sug2=0&inputT=2665&rsv_sug4=3703; "
            + "ykss=c8fdfc54135cf3861204f445; u=__LOGOUT__; P_F=1; P_T=1425873387; "
            + "JSESSIONID=abcqGbHvjt8bF5eaKa9Vu; __utmarea=";

    /**
     * URL filter that prevents a page from being crawled twice. Two URLs that differ only in
     * the order of their query parameters are still treated as different.
     *
     * @param url candidate URL
     * @return {@code true} if the URL is non-blank and was not seen before (it is then
     *         recorded in {@link #targetList}); {@code false} otherwise
     */
    public boolean filterUrl(String url) {
        if (StringUtils.isBlank(url) || targetList.contains(url)) {
            return false;
        }
        targetList.add(url);
        return true;
    }

    /**
     * Downloads one chapter page and stores its title and text in {@link #endresult}, keyed
     * by the chapter number found in the title. Adapt {@link #storeChapter(String)} when the
     * page layout differs.
     *
     * <p>A failed download or parse is retried; after {@value #PARSE_ATTEMPTS} attempts the
     * page is skipped and a message is printed to {@code System.err}. Pages whose title
     * carries no chapter number are skipped silently, and two pages with the same chapter
     * number overwrite each other.
     *
     * @param url chapter page URL
     * @return {@code null} when {@code url} is blank, otherwise the shared {@link #endresult}
     *         map (whether or not this page added an entry)
     */
    public Map<Integer, Map<String, String>> parsehtml(String url) {
        if (StringUtils.isBlank(url)) {
            return null;
        }
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= PARSE_ATTEMPTS; attempt++) {
            try {
                Map<String, String> response = httpurl(url);
                if (response == null) {
                    // httpurl gave up after repeated timeouts: count it as a failed attempt.
                    lastFailure = null;
                    continue;
                }
                storeChapter(response.get("1"));
                return endresult;
            } catch (InterruptedException e) {
                // Asked to stop: restore the flag and do not retry.
                Thread.currentThread().interrupt();
                return endresult;
            } catch (Exception e) {
                lastFailure = e;
            }
        }
        System.err.println("Giving up on " + url + " after " + PARSE_ATTEMPTS + " attempts"
                + (lastFailure == null ? " (timed out)" : ": " + lastFailure));
        return endresult;
    }

    /** Extracts title and text from a chapter page and records them in {@link #endresult}. */
    private void storeChapter(String html) {
        if (StringUtils.isBlank(html)) {
            return;
        }
        Document doc = Jsoup.parse(html);
        String title = doc.select("div.bookname").select("h1").text();
        // html() is used instead of text() so that <br> can be turned into line breaks.
        // It serialises non-breaking spaces as the entity "&nbsp;", which must not end up
        // in the plain-text output, so that entity is stripped as well.
        String content = doc.select("div#content").html()
                .replaceAll("<br>", "\r\n")
                .replaceAll(" ", "")
                .replace("&nbsp;", "")
                .replaceAll("<script>chaptererror\\(\\);</script>", "");
        Matcher matcher = CHAPTER_TITLE.matcher(title);
        if (!matcher.find()) {
            return;
        }
        int chapterNumber = chineseNumber2Int(matcher.group(1));
        Map<String, String> chapter = new HashMap<String, String>();
        chapter.put("title", title);
        chapter.put("content", content);
        endresult.put(Integer.valueOf(chapterNumber), chapter);
    }

    /**
     * Collects the chapter links ({@code div#list dd a}) from a book's index page.
     *
     * <p>Links are resolved against {@code startUrl}, so root-relative, relative and absolute
     * hrefs all work. URLs rejected by {@link #filterUrl(String)} are left out.
     *
     * @param startUrl URL of the index page listing all chapters
     * @return the new chapter URLs in page order; empty (never {@code null}) when
     *         {@code startUrl} is blank or the page could not be fetched
     */
    public List<String> getAllurl(String startUrl) {
        List<String> chapterUrls = new ArrayList<String>();
        if (StringUtils.isBlank(startUrl)) {
            return chapterUrls;
        }
        try {
            Map<String, String> response = httpurl(startUrl);
            if (response == null) {
                System.err.println("Timed out fetching index page " + startUrl);
                return chapterUrls;
            }
            String html = response.get("1");
            String domain = response.get("2");
            if (StringUtils.isBlank(html)) {
                return chapterUrls;
            }
            Document doc = Jsoup.parse(html, startUrl);
            Elements entries = doc.select("div#list").select("dd");
            for (Element entry : entries) {
                Element link = entry.select("a[href]").first();
                if (link == null || StringUtils.isBlank(link.attr("href"))) {
                    continue;
                }
                String chapterUrl = link.absUrl("href");
                if (StringUtils.isBlank(chapterUrl)) {
                    // Could not be resolved against the base URL: fall back to the
                    // host-relative form this method has always produced.
                    chapterUrl = "http://" + domain + link.attr("href");
                }
                if (filterUrl(chapterUrl)) {
                    chapterUrls.add(chapterUrl);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return chapterUrls;
    }

    /**
     * Performs an HTTP GET and returns the response body as text.
     *
     * <p>A request that times out is repeated after a pause of
     * {@value #TIMEOUT_BACKOFF_MILLIS} ms, up to {@value #TIMEOUT_ATTEMPTS} attempts in total.
     *
     * <p>NOTE(review): the charset is taken from the Content-Type header only (default
     * UTF-8); a page that declares its charset solely in a meta tag may be decoded wrongly.
     *
     * @param urlStr URL to fetch
     * @return a map with key {@code "1"} = response body and key {@code "2"} = host name of
     *         {@code urlStr} (the string {@code "false"} if the URL has no host), or
     *         {@code null} when every attempt timed out
     * @throws Exception an {@link IOException} for any non-timeout I/O failure (including a
     *         malformed URL), or an {@link InterruptedException} if interrupted while pausing
     */
    public static Map<String, String> httpurl(String urlStr) throws Exception {
        for (int attempt = 1; ; attempt++) {
            try {
                return fetchOnce(urlStr);
            } catch (SocketTimeoutException timeout) {
                if (attempt >= TIMEOUT_ATTEMPTS) {
                    return null;
                }
                Thread.sleep(TIMEOUT_BACKOFF_MILLIS);
            }
        }
    }

    /** Issues a single GET request and reads the (optionally gzip-compressed) body. */
    private static Map<String, String> fetchOnce(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection connection = null;
        InputStream inputStream = null;
        BufferedReader reader = null;
        try {
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            // Only gzip is advertised because only gzip is decoded below.
            connection.setRequestProperty("accept-encoding", "gzip");
            connection.setUseCaches(false);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestProperty("Referer", urlStr);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setRequestProperty("cookie", COOKIE);

            inputStream = connection.getInputStream();
            String charset = charsetOf(connection.getHeaderField("Content-Type"));
            String contentEncoding = connection.getContentEncoding();
            InputStream body = inputStream;
            if (contentEncoding != null && contentEncoding.contains("gzip")) {
                body = new GZIPInputStream(inputStream);
            }
            reader = new BufferedReader(new InputStreamReader(body, charset));

            StringBuilder html = new StringBuilder();
            // A failure before any content was read propagates to the caller (this is what
            // makes the timeout retry possible) ...
            String line = reader.readLine();
            while (line != null) {
                // Keep the line break: gluing lines together can merge markup or words
                // that were separated only by the newline.
                html.append(line).append('\n');
                try {
                    line = reader.readLine();
                } catch (IOException midBody) {
                    // ... whereas a failure in the middle of the body is tolerated and the
                    // part read so far is returned (best effort, as before).
                    line = null;
                }
            }

            Map<String, String> result = new HashMap<String, String>();
            String domain = url.getHost();
            result.put("1", html.toString());
            result.put("2", StringUtils.isBlank(domain) ? "false" : domain);
            return result;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns the charset named in a Content-Type header value, or UTF-8 when the header is
     * missing, names no charset, or names one this JVM does not support.
     */
    private static String charsetOf(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        String[] parameters = contentType.split(";");
        for (int i = 0; i < parameters.length; i++) {
            String parameter = parameters[i].trim();
            int equalsAt = parameter.indexOf('=');
            if (equalsAt < 0
                    || !parameter.substring(0, equalsAt).trim().equalsIgnoreCase("charset")) {
                continue;
            }
            String name = parameter.substring(equalsAt + 1).trim();
            if (name.length() >= 2 && name.charAt(0) == '"'
                    && name.charAt(name.length() - 1) == '"') {
                name = name.substring(1, name.length() - 1).trim();
            }
            try {
                if (name.length() > 0 && Charset.isSupported(name)) {
                    return name;
                }
            } catch (IllegalArgumentException illegalName) {
                // syntactically illegal charset name: use the default instead
            }
            break;
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Converts a chapter number written in Arabic digits or Chinese numerals to an int.
     *
     * <p>Surrounding whitespace is ignored. Characters that are neither numerals nor units
     * are skipped. Values above {@code Integer.MAX_VALUE} are clamped.
     *
     * @param chineseNumber e.g. {@code "123"}, {@code "十二"}, {@code "一千零二十"},
     *        {@code "十二万"}
     * @return the numeric value; 0 for {@code null}, blank or numeral-free input
     */
    public int chineseNumber2Int(String chineseNumber) {
        if (chineseNumber == null) {
            return 0;
        }
        String text = chineseNumber.trim();
        if (text.length() == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException notArabic) {
            // Not plain digits: interpret it as Chinese numerals below.
        }

        long total = 0;    // value of the completed 万 / 亿 groups
        long section = 0;  // value accumulated since the last 万 / 亿 (always below 10000)
        int digit = 0;     // last digit read that has not been given a unit yet
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            int value = chineseDigit(c);
            if (value >= 0) {
                digit = value;
                continue;
            }
            switch (c) {
                case '十':
                    // A unit without a leading digit counts once ("十二" is 12).
                    section += (digit == 0 ? 1 : digit) * 10L;
                    digit = 0;
                    break;
                case '百':
                    section += (digit == 0 ? 1 : digit) * 100L;
                    digit = 0;
                    break;
                case '千':
                    section += (digit == 0 ? 1 : digit) * 1000L;
                    digit = 0;
                    break;
                case '万': {
                    // 万 multiplies everything since the previous big unit ("十二万" = 120000).
                    long group = section + digit;
                    total += (group == 0 ? 1 : group) * 10000L;
                    section = 0;
                    digit = 0;
                    break;
                }
                case '亿': {
                    long group = total + section + digit;
                    total = (group == 0 ? 1 : group) * 100000000L;
                    section = 0;
                    digit = 0;
                    break;
                }
                default:
                    break;
            }
        }
        long result = total + section + digit;
        if (result < 0 || result > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) result;
    }

    /** Returns the value (0-9) of a Chinese digit character, or -1 if it is not a digit. */
    private static int chineseDigit(char c) {
        switch (c) {
            case '零':
            case '〇':
                return 0;
            case '一':
                return 1;
            case '二':
            case '两':
                return 2;
            case '三':
                return 3;
            case '四':
                return 4;
            case '五':
                return 5;
            case '六':
                return 6;
            case '七':
                return 7;
            case '八':
                return 8;
            case '九':
                return 9;
            default:
                return -1;
        }
    }

    /**
     * Appends {@code content} followed by CRLF to a file. The file, and any missing parent
     * directory, is created if necessary. I/O failures are reported on {@code System.err}
     * and not propagated.
     *
     * @param fileName path of the output file
     * @param content  text to append
     * @param encoding charset name used to encode the text
     */
    public void writefle(String fileName, String content, String encoding) {
        File file = new File(fileName);
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        FileOutputStream fileOut = null;
        BufferedWriter out = null;
        try {
            fileOut = new FileOutputStream(file, true); // true = append
            out = new BufferedWriter(new OutputStreamWriter(fileOut, encoding));
            out.write(content + "\r\n");
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException.
            e.printStackTrace();
        } finally {
            try {
                if (out != null) {
                    out.close();
                } else if (fileOut != null) {
                    // The writer was never created (e.g. unknown encoding): close the raw stream.
                    fileOut.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the keys of {@code map} in ascending order.
     *
     * @param map chapter map; may be {@code null}
     * @return a new sorted list of the keys (empty for a {@code null} map)
     */
    public List<Integer> sortmap(Map<Integer, Map<String, String>> map) {
        if (map == null) {
            return new ArrayList<Integer>();
        }
        List<Integer> keys = new ArrayList<Integer>(map.keySet());
        Collections.sort(keys);
        return keys;
    }

    /**
     * Crawls a whole book: reads the chapter list from {@code starturl}, downloads the
     * chapters with {@value #WORKER_THREADS} worker threads (each chapter exactly once),
     * waits for all of them, and then appends every chapter held in {@link #endresult} to
     * {@code outputPath} in chapter order.
     *
     * <p>If the calling thread is interrupted while waiting, the downloads are cancelled and
     * nothing is written.
     *
     * @param starturl   URL of the page listing all chapters of the book
     * @param outputPath file the chapters are appended to (UTF-8)
     */
    public void execute(String starturl, String outputPath) {
        List<String> chapterUrls = getAllurl(starturl);
        if (chapterUrls != null && !chapterUrls.isEmpty()) {
            ExecutorService pool = Executors.newFixedThreadPool(WORKER_THREADS);
            for (final String chapterUrl : chapterUrls) {
                if (StringUtils.isBlank(chapterUrl)) {
                    continue;
                }
                pool.execute(new Runnable() {
                    public void run() {
                        Map<Integer, Map<String, String>> parsed = parsehtml(chapterUrl);
                        System.out.println("解析完"+" "+chapterUrl+"=================大小:"+parsed.size());
                    }
                });
            }
            pool.shutdown();
            try {
                while (!pool.awaitTermination(1, TimeUnit.MINUTES)) {
                    // downloads still running: keep waiting
                }
            } catch (InterruptedException e) {
                pool.shutdownNow();
                Thread.currentThread().interrupt();
                return;
            }
        }

        int written = 1;
        for (Integer chapterNumber : sortmap(endresult)) {
            Map<String, String> chapter = endresult.get(chapterNumber);
            String content = chapter.get("title") + "\r\n" + chapter.get("content");
            writefle(outputPath, content, "utf8");
            System.out.println("写文件中"+" ----------------------- "+written);
            written++;
        }
    }

    /** Example run: crawls one book from its index page into a local text file. */
    public static void main(String[] args) {
        cralweBook crawler = new cralweBook();
        crawler.execute("http://www.qu.la/book/4140/", "G://crawler//太古神王.txt");
    }
}
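// Closing note from the author: I hope the code above is helpful to readers. (Code that implements communication over sockets will follow, once I have worked it out.)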