用JAVA实现一个爬虫,爬取知乎上的内容(代码已无法使用)

在学习JAVA的过程中写的一个程序,处理上还是有许多问题,爬简单的页面还行,复杂的就要跪.
爬取内容主要使用URLConnection请求获得页面内容,使用正则匹配页面内容获得所需的信息存入文件,使用正则寻找这个页面中可访问的URL,使用队列存储未访问的URL,使用集合(Set)存储已访问的URL。另外,由于没有用到第三方包,所以不需要去下载新的jar包;如果遇到报错,可能是快捷导入的包错了,改过来就好了。

还是直接上代码吧。

public class MainTest {
/*
 * author:luo bangliu
 * SCUT
 * */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        String url="http://www.zhihu.com/explore/recommendations";
        String result=Spider.SendGet(url);
        }
        Spider.getTarget(result);

    }
}

/**
 * Regex-based crawler helpers: page download, link extraction and the
 * crawl loop that writes every visited question to a text file.
 */
public class Spider {
    /** Regex capturing the text that follows a {@code question_link} tag; not referenced in this class. */
    static String getEdit="question_link.+?>(.+?)<";
    /** Regex capturing the href of a {@code question_link} anchor inside an {@code <h2>} element. */
    static String getUrl="<h2>.+?question_link.+?href=\"(.+?)\".+?</h2>";
    /** Regex capturing the href of any {@code question_link} anchor. */
    static String getDeepUrl="question_link.+?href=\"(.+?)\"";

    /** New links are harvested only while the pending queue holds at most this many URLs. */
    private static final int MAX_PENDING_URLS=1000;
    /** File that receives the text of every crawled question. */
    private static final String OUTPUT_FILE="E:/test.txt";

    /**
     * Downloads {@code url} and returns the body decoded as UTF-8.
     * Lines are concatenated without separators, so the result is a single line.
     *
     * @param url address to fetch
     * @return the page text; whatever was read before a failure (possibly empty), never {@code null}
     */
    static String SendGet(String url){
        StringBuilder result=new StringBuilder();
        System.out.println("SendGet正在抓取:"+url);
        try{
            URL realUrl=new URL(url);
            URLConnection connection=realUrl.openConnection();
            connection.connect();
            // try-with-resources closes the reader on every exit path.
            try(BufferedReader in=new BufferedReader(new InputStreamReader(
                    connection.getInputStream(),"UTF-8"))){
                String line;
                while((line=in.readLine())!=null)
                {
                    result.append(line);
                }
            }
        }
        catch (Exception e) {
            // Best effort: report the failure and return what was read so far.
            System.out.println("Spider get error:"+url);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Builds one initialised {@code Zhihu} per question link found in {@code content}.
     *
     * @param content HTML to scan with {@link #getUrl}
     * @return list of distinct {@code Zhihu} objects, in match order
     */
    static ArrayList<Zhihu> getRec(String content)
    {
        ArrayList<Zhihu> result=new ArrayList<Zhihu>();
        Matcher matcher=Pattern.compile(getUrl).matcher(content);
        while(matcher.find())
        {
            // A fresh object per match: sharing one instance would make every
            // list element reflect only the last match.
            Zhihu zhihu=new Zhihu();
            zhihu.init(matcher.group(1));
            result.add(zhihu);
        }
        return result;
    }

    /**
     * Converts a scraped link into an absolute question URL.
     *
     * @param url link as found in the page
     * @return {@code http://www.zhihu.com/question/<id>} when the link contains
     *         {@code question/<id>/}; the link prefixed with the host when it is
     *         shorter than 25 characters; otherwise the recommendations page URL
     */
    static String getRealUrl(String url)
    {
        String ret = "http://www.zhihu.com/explore/recommendations";
        Pattern pattern=Pattern.compile("question/(.*?)/");
        Matcher matcher=pattern.matcher(url);
        if(matcher.find())
            ret="http://www.zhihu.com/question/"+matcher.group(1);
        else if(url.length()<25){
            ret="http://www.zhihu.com"+url;
        }
        return ret;
    }

    /**
     * Runs the crawl: seeds the queue with the links in {@code content}, then
     * repeatedly dequeues a URL, scrapes it, harvests further links and appends
     * the scraped question to the output file until the queue is empty.
     *
     * @param content HTML of the starting page
     */
    static void getTarget(String content){

        SpiderQueue queue=new SpiderQueue();
        Matcher seedMatcher=Pattern.compile(getUrl).matcher(content);
        Zhihu tmp=new Zhihu();
        while(seedMatcher.find())
        {
            queue.addUnvisitedUrl(seedMatcher.group(1));
            System.out.println("from recommendations:"+seedMatcher.group(1));
        }

        // Loop-invariant: compile the deep-link pattern once.
        Pattern deepLinkPattern=Pattern.compile(getDeepUrl);
        while(!queue.unVisitedUrlsEmpty())
        {
            String url=(String) queue.unVisitedUrlDequeue();
            queue.addVisiteUrl(url);
            // NOTE(review): Zhihu.init downloads the page itself, and the
            // SendGet below downloads it again for link harvesting.
            tmp.init(url);

            url=getRealUrl(url);

            String c=SendGet(url);

            Matcher m=deepLinkPattern.matcher(c);
            if(queue.getUnVisitedUrlNum()<=MAX_PENDING_URLS){
                while(m.find()){
                    System.out.println("get url from:"+m.group(1)+" number:"+queue.getUnVisitedUrlNum());
                    queue.addUnvisitedUrl(m.group(1));
                }
            }

            System.out.println("the loop :"+url);

            // Append the scraped data to the output file.
            FileReaderWriter.writeIntoFile(tmp.writeString(),
                    OUTPUT_FILE, true);
        }

        System.out.println("queue is empty"+queue.getVisitedUrlNum());
    }

}

/**
 * Small file helpers: create an empty file, or write text to a file,
 * creating missing parent directories first.
 */
public class FileReaderWriter {
    /**
     * Creates an empty file at {@code filePath}, creating parent directories as needed.
     * Backslashes in the path are treated as {@code /}.
     *
     * @param filePath path of the file to create; may be a bare file name
     * @return {@code true} if a new file was created; {@code false} if it already
     *         existed or an I/O error occurred
     */
    public static boolean createNewFile(String filePath){
        File file=new File(filePath.replace('\\','/'));
        // getParentFile() is null for a bare file name; slicing the string at
        // the last '/' would throw in that case.
        File parent=file.getParentFile();
        if(parent!=null){
            parent.mkdirs();
        }
        try{
            return file.createNewFile();
        }catch(IOException e)
        {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Writes {@code content} to {@code filePath}, creating the file and its parent
     * directories when they do not exist.
     *
     * @param content  text to write
     * @param filePath destination path; may be a bare file name
     * @param ifAppend {@code true} to append, {@code false} to overwrite
     * @return {@code true} if the text was written and the file closed without an
     *         {@link IOException}; {@code false} otherwise
     */
    public static boolean writeIntoFile(String content,String filePath,boolean ifAppend){
        File file=new File(filePath);
        File parent=file.getParentFile();
        if(parent!=null){
            parent.mkdirs();
        }
        // FileWriter creates the file itself; try-with-resources closes it on every path.
        try(FileWriter fileWriter=new FileWriter(file,ifAppend)){
            fileWriter.write(content);
            fileWriter.flush();
            return true;
        }catch(IOException e)
        {
            e.printStackTrace();
            return false;
        }
    }
}

/**
 * Bookkeeping for the crawl: a set of visited URLs and a FIFO queue of
 * URLs still to visit. Both collections are static, so every
 * {@code SpiderQueue} instance operates on the same data.
 */
public class SpiderQueue {
    private static final Set<Object>visitedUrl=new HashSet<>();
    private static final Queue unVisitedUrl=new Queue();

    /** Records {@code Url} as visited. */
    public  void addVisiteUrl(String Url){
        visitedUrl.add(Url);
    }

    /** Removes {@code url} from the visited set. */
    public  void removeVisitedUrl(String url){
        visitedUrl.remove(url);
    }

    /** @return number of URLs recorded as visited */
    public  int getVisitedUrlNum(){
        return visitedUrl.size();
    }

    /** @return the URL at the head of the pending queue, removing it */
    public  Object unVisitedUrlDequeue(){
        return unVisitedUrl.deQueue();
    }

    /**
     * Queues {@code url} for a later visit unless it is {@code null}, blank,
     * already visited or already queued. The outcome is printed either way.
     *
     * @param url candidate URL
     */
    public  void addUnvisitedUrl(String url){
        // Each rejection reason is tested once, in the same order as the
        // acceptance condition, so exactly one message is printed per call.
        if(url==null){
            System.out.println("url=null");
            return;
        }
        if(url.trim().equals("")){
            System.out.println("url equals null");
            return;
        }
        if(visitedUrl.contains(url)){
            System.out.println("vistedList alearld have");
            return;
        }
        if(unVisitedUrl.contians(url)){
            System.out.println("unVisitedList alearld have");
            return;
        }
        unVisitedUrl.enQueue(url);
        System.out.println("add to list success"+url);
    }

    /** @return {@code true} when no URL is waiting to be visited */
    public  boolean unVisitedUrlsEmpty(){
        return unVisitedUrl.empty();
    }

    /** @return number of URLs waiting to be visited */
    public int getUnVisitedUrlNum(){
        return unVisitedUrl.getNum();
    }
}

/**
 * Holds the data scraped from one question page: title, description,
 * resolved URL, and the answers with their author names.
 */
public class Zhihu {
    // Regex for the question title
    public static String getQuestion="zh-question-title.+?<h2.+?>(.+?)</h2>";
    // Regex for the question description
    public static String getDetail="zh-question-detail.+?<div.+?>(.*?)</div>";
    // Regex for an answer (group 1: author name, group 2: answer body)
    public static String getAnswer="data-author-name=\"(.+?)\".+?<div.+?>(.+?)</div>";

    public String question;
    public String zhihuUrl;
    public ArrayList<String> authorName;
    public ArrayList<String> answers;
    public String questionDesc;

    /** Creates an instance with empty strings and empty lists. */
    public Zhihu(){
        clearFields();
    }

    /**
     * Clears all fields, resolves {@code Url} to a question URL, downloads the
     * page through {@code Spider.SendGet} and fills the fields from regex matches.
     * A title or description that cannot be matched is stored as {@code "lost"}.
     * Any exception is caught and reported; the fields keep whatever was set so far.
     *
     * @param Url link as scraped from a page
     */
    public void init(String Url)
    {
        clearFields();
        try {
            if(!getRealUrl(Url)){
                return;
            }
            String page=Spider.SendGet(zhihuUrl);
            System.out.println("zhihu spider begin:"+zhihuUrl);
            if(page==null){
                return;
            }

            Matcher titleMatcher=Pattern.compile(getQuestion).matcher(page);
            if(titleMatcher.find()){
                question=titleMatcher.group(1);
            }else{
                question="lost";
                System.out.println("lost question:"+Url);
            }

            Matcher detailMatcher=Pattern.compile(getDetail).matcher(page);
            if(detailMatcher.find()){
                questionDesc=detailMatcher.group(1);
            }else{
                questionDesc="lost";
                System.out.println("lost questionDesc:"+Url);
            }

            Matcher answerMatcher=Pattern.compile(getAnswer).matcher(page);
            while(answerMatcher.find())
            {
                authorName.add(answerMatcher.group(1));
                answers.add(answerMatcher.group(2));
            }
        } catch (Exception e) {
            System.out.println("zhihu class error");
            e.printStackTrace();
        }
    }

    /** @return always {@code true} */
    public boolean getAll() {
          return true;
    }

    @Override
    public String toString()
    {
        return "question:"+question+"\n description:"+questionDesc+"\n link:"+zhihuUrl
                +"\n answer:"+answers+"\n";
    }

    /**
     * Resolves {@code url} into {@link #zhihuUrl}.
     *
     * @param url link as scraped from a page
     * @return {@code true} if {@link #zhihuUrl} was set (the link contains
     *         {@code question/<id>/} or is shorter than 25 characters);
     *         {@code false} otherwise, leaving {@link #zhihuUrl} untouched
     */
    public boolean getRealUrl(String url)
    {
        Matcher idMatcher=Pattern.compile("question/(.*?)/").matcher(url);
        if(idMatcher.find()){
            zhihuUrl="http://www.zhihu.com/question/"+idMatcher.group(1);
            return true;
        }
        if(url.length()<25){
            zhihuUrl="http://www.zhihu.com"+url;
            return true;
        }
        return false;
    }

    /**
     * Renders the scraped data as text for the output file: {@code <br>} becomes
     * a line break and every remaining {@code <...>} tag is removed.
     *
     * @return the formatted record, terminated by blank lines
     */
    public String writeString() {
        StringBuilder text=new StringBuilder();
        text.append("问题:").append(question).append("\r\n");
        text.append("描述:").append(questionDesc).append("\r\n");
        text.append("链接:").append(zhihuUrl).append("\r\n");
        int count=answers.size();
        for (int idx = 0; idx < count; idx++) {
            text.append("作者").append(idx).append(":").append(authorName.get(idx)).append("\r\n");
            text.append("回答").append(idx).append(":").append(answers.get(idx)).append("\r\n");
        }
        text.append("\r\n\r\n");
        String withBreaks=text.toString().replaceAll("<br>", "\r\n");
        return withBreaks.replaceAll("<.*?>", "");
    }

    /** Sets every field to its empty value. */
    private void clearFields(){
        question="";
        zhihuUrl="";
        answers=new ArrayList<String>();
        authorName=new ArrayList<String>();
        questionDesc="";
    }

}


/**
 * Minimal FIFO queue of objects backed by a {@link LinkedList}.
 */
public class Queue {
    private LinkedList<Object> items = new LinkedList<Object>();

    /** Appends {@code t} at the tail. */
    public void enQueue(Object t){
        items.add(t);
    }

    /**
     * Removes and returns the head element.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public Object deQueue(){
        return items.remove();
    }

    /** @return {@code true} when the queue holds no elements */
    public boolean isQueueEmpty(){
        return items.size() == 0;
    }

    /** @return {@code true} when {@code t} is currently queued */
    public boolean contians(Object t){
        return items.indexOf(t) >= 0;
    }

    /** Same result as {@link #isQueueEmpty()}. */
    public boolean empty() {
        return isQueueEmpty();
    }

    /** @return number of queued elements */
    public int getNum()
    {
        return items.size();
    }
}
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 12
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 12
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值