Depth-first recursive crawler
#!/usr/bin/python
# coding: utf-8
import requests
import re

exist_url = []      # URLs that have already been crawled (or failed)
g_writecount = 0    # number of links written to the output file

def scrappy(url, depth=1):
    global g_writecount
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        r = requests.get("https://en.wikipedia.org/wiki/" + url, headers=headers)
        html = r.text
    except Exception as e:
        print('Failed downloading and saving', url)
        print(e)
        exist_url.append(url)
        return None

    exist_url.append(url)
    # Extract the wiki links on this page, then drop any URL already visited.
    link_list = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)
    unique_list = list(set(link_list) - set(exist_url))

    for eachone in unique_list:
        g_writecount += 1
        output = "No." + str(g_writecount) + "\t Depth:" + str(depth) + "\t" + url + ' -> ' + eachone + '\n'
        # print(output)
        with open('link_12-3.txt', "a+") as f:
            f.write(output)
        if depth < 2:
            scrappy(eachone, depth + 1)

scrappy("Wikipedia")
Breadth-first multi-threaded crawler
#!/usr/bin/env python
# coding: utf-8
import threading
import requests
import re

g_mutex = threading.Condition()   # lock protecting the shared state below
g_pages = []                      # downloaded pages, parsed later for new links
g_queueURL = []                   # URLs waiting to be crawled
g_existURL = []                   # URLs that have already been crawled
g_writecount = 0                  # number of links found


class Crawler:
    def __init__(self, url, threadnum):
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []

    def craw(self):
        # Control loop of the crawler: download one depth level,
        # then rebuild the queue from the pages just fetched.
        global g_queueURL, g_pages
        g_queueURL.append(self.url)
        depth = 1
        while depth < 3:
            print('Searching depth ', depth, '...\n')
            self.downloadAll()
            self.updateQueueURL()
            g_pages = []
            depth += 1

    def downloadAll(self):
        # Start crawler threads in batches of at most threadnum and join
        # each batch before moving on, until the current queue is exhausted.
        global g_queueURL
        i = 0
        while i < len(g_queueURL):
            j = 0
            while j < self.threadnum and i + j < len(g_queueURL):
                self.download(g_queueURL[i + j], j)
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []
        g_queueURL = []

    def download(self, url, tid):
        # Start one crawler thread for a single URL.
        crawthread = CrawlerThread(url, tid)
        self.threadpool.append(crawthread)
        crawthread.start()

    def updateQueueURL(self):
        # After finishing one depth level, rebuild the queue from the links
        # found in the downloaded pages, minus the URLs already crawled.
        global g_queueURL
        newUrlList = []
        for content in g_pages:
            newUrlList += self.getUrl(content)
        g_queueURL = list(set(newUrlList) - set(g_existURL))

    def getUrl(self, content):
        # Parse the wiki links out of one downloaded page.
        link_list = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', content)
        unique_list = list(set(link_list))
        return unique_list


class CrawlerThread(threading.Thread):
    # One crawler thread: download a page and record the links it contains.
    def __init__(self, url, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid

    def run(self):
        global g_mutex
        global g_writecount
        try:
            print(self.tid, "crawl ", self.url)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            r = requests.get("https://en.wikipedia.org/wiki/" + self.url, headers=headers)
            html = r.text
            link_list2 = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)
            unique_list2 = list(set(link_list2))
            for eachone in unique_list2:
                g_mutex.acquire()       # g_writecount is shared by all threads
                g_writecount += 1
                content2 = "No." + str(g_writecount) + "\t Thread" + str(self.tid) + "\t" + self.url + '->' + eachone + '\n'
                g_mutex.release()
                with open('title2.txt', "a+") as f:
                    f.write(content2)
        except Exception as e:
            g_mutex.acquire()
            g_existURL.append(self.url)
            g_mutex.release()
            print('Failed downloading and saving', self.url)
            print(e)
            return None
        g_mutex.acquire()
        g_pages.append(html)
        g_existURL.append(self.url)
        g_mutex.release()


if __name__ == "__main__":
    url = "Wikipedia"
    threadnum = 5
    crawler = Crawler(url, threadnum)
    crawler.craw()
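
Managing the threads and joins by hand, as above, works but is easy to get wrong: every access to the shared lists and to g_writecount has to go through g_mutex. For comparison, roughly the same "one breadth-first level at a time" pattern can be expressed with the standard-library concurrent.futures module, which manages the thread pool and the joining itself. The sketch below is only an illustration of that alternative, not a rewrite of the Crawler class; fetch_links is a hypothetical helper like the download step performed inside CrawlerThread.run.

# Minimal sketch: one breadth-first level per iteration, using a thread pool
# from the standard library instead of hand-managed CrawlerThread objects.
from concurrent.futures import ThreadPoolExecutor
import requests
import re

def fetch_links(url):
    # Return the set of wiki links on one page; an empty set on failure.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get("https://en.wikipedia.org/wiki/" + url, headers=headers)
        return set(re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', r.text))
    except Exception as e:
        print('Failed downloading', url, e)
        return set()

def crawl_bfs(start, max_depth=2, threadnum=5):
    visited = set()
    queue = [start]
    for depth in range(1, max_depth + 1):
        print('Searching depth', depth, '...')
        with ThreadPoolExecutor(max_workers=threadnum) as pool:
            # map() downloads the whole level in parallel and returns the
            # link sets in order; the executor joins its threads on exit.
            results = list(pool.map(fetch_links, queue))
        visited.update(queue)
        # The next level is everything found at this level, minus URLs already crawled.
        queue = list(set().union(*results) - visited)
    return visited

# crawl_bfs("Wikipedia")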