这是我在学习 Java 的过程中写的一个爬虫程序,处理上还有许多问题:爬取简单的页面还行,遇到复杂的页面就容易出错。
程序主要使用 URLConnection 请求获得页面内容,用正则表达式匹配页面内容,把所需的信息存入文件,再用正则表达式寻找该页面中可访问的 URL;未访问的 URL 用队列存储,已访问的 URL 用集合(HashSet)记录。另外,由于没有用到第三方包,所以不需要下载新的 jar 包;如果遇到报错,可能是 IDE 自动导入的包不对(例如 Queue 应使用本文自定义的类,而不是 java.util.Queue),改过来就好了。
还是直接上代码吧。
public class MainTest {
/*
* author:luo bangliu
* SCUT
* */
public static void main(String[] args) {
// TODO Auto-generated method stub
String url="http://www.zhihu.com/explore/recommendations";
String result=Spider.SendGet(url);
}
Spider.getTarget(result);
}
}
/**
 * Small regex-based crawler: downloads pages, extracts question links and
 * writes the parsed questions to a text file.
 */
public class Spider {
    /** Regex whose group 1 captures the text following a question_link anchor tag. */
    static String getEdit="question_link.+?>(.+?)<";
    /** Regex whose group 1 captures the href of a question link inside an h2 element. */
    static String getUrl="<h2>.+?question_link.+?href=\"(.+?)\".+?</h2>";
    /** Regex whose group 1 captures the href of any question link. */
    static String getDeepUrl="question_link.+?href=\"(.+?)\"";

    /** New links are only harvested from a page while the pending queue holds at most this many URLs. */
    private static final int MAX_PENDING_URLS = 1000;
    /** File that receives the text of every crawled question. */
    private static final String OUTPUT_FILE = "E:/test.txt";

    /**
     * Downloads a page with a GET request and returns its body decoded as UTF-8.
     * Lines are concatenated without line separators.
     *
     * @param url address to fetch
     * @return the page content; on failure, whatever was read before the error
     *         (possibly the empty string) - never null
     */
    static String SendGet(String url){
        StringBuilder result = new StringBuilder();
        System.out.println("SendGet正在抓取:"+url);
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader on every path.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("Spider get error:"+url);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Builds one Zhihu object per question link found in the given page.
     *
     * @param content HTML of a page containing question links
     * @return the parsed questions, in the order their links appear
     */
    static ArrayList<Zhihu> getRec(String content)
    {
        ArrayList<Zhihu> result = new ArrayList<Zhihu>();
        Matcher matcher = Pattern.compile(getUrl).matcher(content);
        while (matcher.find()) {
            // A fresh instance per link: sharing one instance would make every
            // list element refer to the same object holding only the last page.
            Zhihu zhihu = new Zhihu();
            zhihu.init(matcher.group(1));
            result.add(zhihu);
        }
        return result;
    }

    /**
     * Converts a link taken from a page into an absolute URL.
     *
     * @param url raw href value
     * @return the question URL when the link contains "question/&lt;id&gt;/";
     *         the site prefix plus the link when it is shorter than 25 characters;
     *         otherwise the recommendations page URL
     */
    static String getRealUrl(String url)
    {
        Matcher matcher = Pattern.compile("question/(.*?)/").matcher(url);
        if (matcher.find()) {
            return "http://www.zhihu.com/question/" + matcher.group(1);
        }
        if (url.length() < 25) {
            return "http://www.zhihu.com" + url;
        }
        return "http://www.zhihu.com/explore/recommendations";
    }

    /**
     * Crawls outward from the given page: queues its question links, then
     * repeatedly takes a URL from the queue, parses it as a question, collects
     * further links from it and appends the question text to the output file.
     *
     * @param content HTML of the starting page
     */
    static void getTarget(String content){
        SpiderQueue queue = new SpiderQueue();
        Matcher matcher = Pattern.compile(getUrl).matcher(content);
        Zhihu tmp = new Zhihu();
        while (matcher.find()) {
            queue.addUnvisitedUrl(matcher.group(1));
            System.out.println("from recommendations:"+matcher.group(1));
        }
        Pattern deepLinkPattern = Pattern.compile(getDeepUrl);
        while (!queue.unVisitedUrlsEmpty()) {
            String url = (String) queue.unVisitedUrlDequeue();
            queue.addVisiteUrl(url);
            // init() re-populates tmp from scratch; it is written out below
            // before the next iteration overwrites it.
            tmp.init(url);
            url = getRealUrl(url);
            String c = SendGet(url);
            Matcher m = deepLinkPattern.matcher(c);
            // The cap is checked once per page, not once per link.
            if (queue.getUnVisitedUrlNum() <= MAX_PENDING_URLS) {
                while (m.find()) {
                    System.out.println("get url from:"+m.group(1)+" number:"+queue.getUnVisitedUrlNum());
                    queue.addUnvisitedUrl(m.group(1));
                }
            }
            System.out.println("the loop :"+url);
            // Append the parsed question to the output file.
            FileReaderWriter.writeIntoFile(tmp.writeString(), OUTPUT_FILE, true);
        }
        System.out.println("queue is empty"+queue.getVisitedUrlNum());
    }
}
/**
 * File helpers used by the crawler to create files and write text to them.
 */
public class FileReaderWriter {
    /**
     * Creates an empty file, creating missing parent directories first.
     * Backslashes in the path are treated as directory separators.
     *
     * @param filePath path of the file to create; may be a bare file name
     * @return true if a new file was created; false if it already existed or
     *         an I/O error occurred
     */
    public static boolean createNewFile(String filePath){
        String normalizedPath = filePath.replaceAll("\\\\", "/");
        File file = new File(normalizedPath);
        ensureParentDirectory(file);
        try {
            return file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Writes text to a file, creating the file and missing parent directories
     * when necessary.
     *
     * @param content  text to write
     * @param filePath destination path; may be a bare file name
     * @param ifAppend true to append to existing content, false to overwrite
     * @return true if the content was written and the file closed without error
     */
    public static boolean writeIntoFile(String content,String filePath,boolean ifAppend){
        File file = new File(filePath);
        ensureParentDirectory(file);
        // FileWriter creates the file itself when it does not exist yet.
        try (FileWriter fileWriter = new FileWriter(file, ifAppend)) {
            fileWriter.write(content);
            fileWriter.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Creates the parent directory chain of the file, if the path names one. */
    private static void ensureParentDirectory(File file) {
        File parent = file.getParentFile();
        // A bare file name has no parent component; nothing to create then.
        if (parent != null) {
            parent.mkdirs();
        }
    }
}
/**
 * Bookkeeping for the crawler: a set of URLs already visited and a FIFO queue
 * of URLs still to visit. Both collections are static, so every SpiderQueue
 * instance operates on the same data.
 */
public class SpiderQueue {
    private static final Set<Object> visitedUrl = new HashSet<>();
    private static final Queue unVisitedUrl = new Queue();

    /** Records a URL as visited. */
    public void addVisiteUrl(String Url){
        visitedUrl.add(Url);
    }

    /** Removes a URL from the visited set. */
    public void removeVisitedUrl(String url){
        visitedUrl.remove(url);
    }

    /** @return number of URLs recorded as visited */
    public int getVisitedUrlNum(){
        return visitedUrl.size();
    }

    /** @return the URL at the head of the pending queue, removing it from the queue */
    public Object unVisitedUrlDequeue(){
        return unVisitedUrl.deQueue();
    }

    /**
     * Queues a URL for a later visit unless it is null, blank, already visited
     * or already queued. The outcome is reported on standard output.
     *
     * @param url candidate URL
     */
    public void addUnvisitedUrl(String url){
        String rejection = rejectionReason(url);
        if (rejection != null) {
            System.out.println(rejection);
            return;
        }
        unVisitedUrl.enQueue(url);
        System.out.println("add to list success"+url);
    }

    /**
     * Returns the message explaining why the URL must not be queued, or null
     * when it is acceptable. Each check runs at most once, in a fixed order.
     */
    private static String rejectionReason(String url) {
        if (url == null) {
            return "url=null";
        }
        if (url.trim().equals("")) {
            return "url equals null";
        }
        if (visitedUrl.contains(url)) {
            return "vistedList alearld have";
        }
        if (unVisitedUrl.contians(url)) {
            return "unVisitedList alearld have";
        }
        return null;
    }

    /** @return true when no URL is waiting to be visited */
    public boolean unVisitedUrlsEmpty(){
        return unVisitedUrl.empty();
    }

    /** @return number of URLs waiting to be visited */
    public int getUnVisitedUrlNum(){
        return unVisitedUrl.getNum();
    }
}
/**
 * One question page: its title, description, URL and the answers found on it,
 * all extracted from the downloaded HTML with regular expressions.
 */
public class Zhihu {
    /** Regex whose group 1 captures the question title. */
    public static String getQuestion="zh-question-title.+?<h2.+?>(.+?)</h2>";
    /** Regex whose group 1 captures the question description. */
    public static String getDetail="zh-question-detail.+?<div.+?>(.*?)</div>";
    /** Regex capturing an answer: group 1 is the author name, group 2 the answer body. */
    public static String getAnswer="data-author-name=\"(.+?)\".+?<div.+?>(.+?)</div>";

    public String question;
    public String zhihuUrl;
    public ArrayList<String> authorName;
    public ArrayList<String> answers;
    public String questionDesc;

    /** Creates an empty question; call {@link #init(String)} to populate it. */
    public Zhihu(){
        clear();
    }

    /**
     * Resets this object, resolves the given link to a question URL, downloads
     * that page and fills in title, description and answers. A title or
     * description that cannot be found is set to "lost". Any exception is
     * reported on standard output and not propagated.
     *
     * @param Url raw link to a question page
     */
    public void init(String Url)
    {
        clear();
        try {
            if (!getRealUrl(Url)) {
                return;
            }
            String page = Spider.SendGet(zhihuUrl);
            System.out.println("zhihu spider begin:"+zhihuUrl);
            if (page == null) {
                return;
            }
            question = firstGroupOrLost(getQuestion, page, "lost question:"+Url);
            questionDesc = firstGroupOrLost(getDetail, page, "lost questionDesc:"+Url);
            Matcher answerMatcher = Pattern.compile(getAnswer).matcher(page);
            while (answerMatcher.find()) {
                authorName.add(answerMatcher.group(1));
                answers.add(answerMatcher.group(2));
            }
        } catch (Exception e) {
            System.out.println("zhihu class error");
            e.printStackTrace();
        }
    }

    /** Puts every field back to its empty state. */
    private void clear() {
        question = "";
        zhihuUrl = "";
        answers = new ArrayList<String>();
        authorName = new ArrayList<String>();
        questionDesc = "";
    }

    /**
     * Returns group 1 of the first match of the regex in the text; when there
     * is no match, prints the given message and returns "lost".
     */
    private static String firstGroupOrLost(String regex, String text, String lostMessage) {
        Matcher found = Pattern.compile(regex).matcher(text);
        if (found.find()) {
            return found.group(1);
        }
        System.out.println(lostMessage);
        return "lost";
    }

    /** @return always true */
    public boolean getAll() {
        return true;
    }

    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder();
        text.append("question:").append(question);
        text.append("\n description:").append(questionDesc);
        text.append("\n link:").append(zhihuUrl);
        text.append("\n answer:").append(answers);
        text.append("\n");
        return text.toString();
    }

    /**
     * Resolves a raw link into {@link #zhihuUrl}.
     *
     * @param url raw link
     * @return true if zhihuUrl was set (the link contains "question/&lt;id&gt;/"
     *         or is shorter than 25 characters); false otherwise
     */
    public boolean getRealUrl(String url)
    {
        Matcher idMatcher = Pattern.compile("question/(.*?)/").matcher(url);
        if (idMatcher.find()) {
            zhihuUrl = "http://www.zhihu.com/question/" + idMatcher.group(1);
            return true;
        }
        if (url.length() < 25) {
            zhihuUrl = "http://www.zhihu.com" + url;
            return true;
        }
        return false;
    }

    /**
     * Renders the question for the output file: title, description, link and
     * each author/answer pair, with "&lt;br&gt;" turned into line breaks and
     * every remaining tag removed.
     *
     * @return the plain-text form of this question
     */
    public String writeString() {
        StringBuilder text = new StringBuilder();
        text.append("问题:").append(question).append("\r\n");
        text.append("描述:").append(questionDesc).append("\r\n");
        text.append("链接:").append(zhihuUrl).append("\r\n");
        int answerCount = answers.size();
        for (int i = 0; i < answerCount; i++) {
            text.append("作者").append(i).append(":").append(authorName.get(i)).append("\r\n");
            text.append("回答").append(i).append(":").append(answers.get(i)).append("\r\n");
        }
        text.append("\r\n\r\n");
        String withLineBreaks = text.toString().replaceAll("<br>", "\r\n");
        return withLineBreaks.replaceAll("<.*?>", "");
    }
}
/**
 * Minimal first-in-first-out queue backed by a linked list.
 */
public class Queue {
    private final LinkedList<Object> items = new LinkedList<Object>();

    /** Appends an element at the tail of the queue. */
    public void enQueue(Object t){
        items.add(t);
    }

    /**
     * Removes and returns the element at the head of the queue.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public Object deQueue(){
        return items.remove();
    }

    /** @return true when the queue holds no elements */
    public boolean isQueueEmpty(){
        return items.size() == 0;
    }

    /** @return true when the queue holds an element equal to the argument */
    public boolean contians(Object t){
        return items.indexOf(t) >= 0;
    }

    /** @return true when the queue holds no elements */
    public boolean empty() {
        return items.size() == 0;
    }

    /** @return number of elements currently queued */
    public int getNum()
    {
        return items.size();
    }
}