scraping_编写第一个网络爬虫_最终版本

以下是自己学习编写的第一个网络爬虫,包含自己写的版本与示例版本的对比

1.自己学习写的最终版本

import datetime  # used by Throttle for per-domain last-access timestamps
import re  # regular expressions for link extraction / matching
import time  # needed by Throttle.wait's time.sleep (was missing -> NameError at runtime)
import urllib.error
import urllib.parse  # converts relative URLs (browser-only) into absolute URLs python can open
import urllib.request
import urllib.robotparser  # parse robots.txt before crawling to respect site restrictions
def download(url, user_agent = "brain", proxy = None, num_retries = 2):
    """Download *url* and return the raw page bytes, or None on failure.

    Args:
        url: absolute URL to fetch.
        user_agent: User-Agent header value (avoids python's default
            Python-urllib/x.y, which some sites block).
        proxy: optional proxy address; installed for the URL's scheme.
        num_retries: how many times to retry on a 5XX server error
            (4XX client errors are not retried).

    Returns:
        The response body as bytes, or None if the download failed.
    """
    print("downloading:", url)
    header = {"user-agent": user_agent}  # custom UA instead of the python default
    req = urllib.request.Request(url, headers = header)

    # Build one opener; add a proxy handler only when a proxy was requested.
    # (The original opened the URL twice: once via the proxy opener, whose
    # response was discarded, and once via urlopen WITHOUT the proxy.)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = { urllib.parse.urlparse(url).scheme: proxy }
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))

    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:  # download failed
        print("download error:", e.reason)
        html = None
        if num_retries > 0:
            # 4XX errors indicate a bad request; 5XX errors are server-side,
            # so only 5XX is worth retrying.
            if hasattr(e, "code") and 500 <= e.code < 600:
                # Retry, preserving the proxy argument. (The original passed
                # num_retries-1 into the *proxy* slot, so the retry counter
                # never decreased -> unbounded recursion on persistent 5XX.)
                return download(url, user_agent, proxy, num_retries - 1)
    return html
#download("http://example.webscraping.com") #normal access works
#download("http://httpstat.us/500") #test page that always returns a 5XX error
#download("http://example.webscraping.com") #访问正常
#download("http://httpstat.us/500") #这个网页测试用,一直是5XXerror

#跟踪链接的爬虫
#link_crawler()函数传入两个参数:要爬取的网站URL、用于跟踪链接的正则表达式。
def link_crawler(seed_url, link_regex, max_depth=2):
    """Crawl from seed_url, following links whose URL matches link_regex.

    Downloads seed_url, extracts all links from the page, and queues every
    link matching link_regex that has not been seen before. Repeats until
    the queue is empty. Depth tracking (seen maps url -> depth) avoids
    crawler traps; set max_depth to a negative number to disable the limit,
    since the current depth then never equals it.

    Args:
        seed_url: starting URL; also used to resolve relative links and to
            locate the site's robots.txt.
        link_regex: pattern matched (via re.match) against each extracted link.
        max_depth: maximum crawl depth (original hard-coded this to 2 inside
            the body, shadowing the parameter; now the parameter is honored).
    """
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # url -> crawl depth; doubles as the visited set

    # Parse robots.txt ONCE, derived from seed_url. (The original re-fetched
    # and re-parsed a hard-coded robots.txt URL on every loop iteration.)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(seed_url, "/robots.txt"))
    rp.read()
    user_agent = "brain"

    # One Throttle instance for the whole crawl. (The original created a new
    # Throttle per iteration, so per-domain timestamps were lost and the
    # rate limit never actually slept.)
    throttle = Throttle(delay=5)  # the example site's robots.txt delay is 5

    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):  # respect robots.txt
            print("Blocked by robots.txt: %s" % url)
            continue

        throttle.wait(url)
        html = download(url)  # could pass user_agent/proxy/num_retries here
        if html is None:  # must test BEFORE str(): str(None) made the old check dead
            continue
        # NOTE(review): str(bytes) yields "b'...'"; decoding would be cleaner,
        # but kept for compatibility with the regex matching used downstream.
        html = str(html)

        depth = seen[url]  # current crawl depth, used to avoid crawler traps
        if depth != max_depth:
            for link in get_links(html):
                # keep only links matching our regular expression
                if re.match(link_regex, link):
                    # resolve relative link (e.g. view/178) against seed_url
                    link = urllib.parse.urljoin(seed_url, link)
                    if link not in seen:  # skip already-crawled links
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        
def get_links(html):
    """Return every href value found in the page's <a href="..."> anchors.

    Matches both <a href="xxx"> and <a href='xxx'> (case-insensitively) and
    extracts the xxx part. Note the extracted URLs are often relative paths
    from the page source (e.g. view/1) and are not directly openable.
    """
    anchor_pattern = re.compile('<a href=["\'](.*?)["\']', re.IGNORECASE)
    return anchor_pattern.findall(html)
    # Equivalent one-liner: re.findall('<a[^>]+href=["\'](.*?)["\']', html),
    # but pre-compiling the pattern is preferable for reuse.

class Throttle:  # download rate limiter; call wait() before each download
    """Add a delay between downloads to the same domain."""

    def __init__(self, delay):
        # Minimum number of seconds between two requests to one domain.
        self.delay = delay
        # Maps domain -> datetime of its most recent access.
        self.domains = {}

    def wait(self, url):
        """Sleep just long enough to honor the delay for this url's domain."""
        domain = urllib.parse.urlparse(url).netloc
        previous = self.domains.get(domain)

        if previous is not None and self.delay > 0:
            remaining = self.delay - (datetime.datetime.now() - previous).seconds
            if remaining > 0:
                # Domain was hit too recently; pause before the next request.
                time.sleep(remaining)
        self.domains[domain] = datetime.datetime.now()

# Crawl only URLs like http://example.webscraping.com/index... or .../view...
link_crawler("http://example.webscraping.com", "/(index|view)")

2.示例网站提供的最终版本(阅读就好,示例代码是用python2实现的)

import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # NOTE(review): Python 2 sample code (print statements, Queue/urlparse
    # modules) — kept verbatim for comparison with the version above.
    # the queue of URL's that still need to be crawled
    crawl_queue = Queue.deque([seed_url])
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further (max_depth=-1 disables the depth limit)
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum (max_urls=-1 means no limit)
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}
        
    def wait(self, url):
        # Sleep only if this domain was accessed less than `delay` seconds ago.
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        # Record this access for the next wait() on the same domain.
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    """Download url and return the page content ('' on failure);
    retries 5XX server errors up to num_retries times."""
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        # install a proxy handler for this URL's scheme only
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''  # empty-string sentinel keeps get_links(html) safe downstream
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    # NOTE(review): `code` is assigned but never used by the caller here.
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates
    # resolve relative links against the seed URL into absolute URLs
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    (compared via the netloc component, e.g. example.webscraping.com)
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    (fetches and parses /robots.txt relative to the given url)
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
        

def get_links(html):
    """Return a list of links from html 
    """
    # Case-insensitive pattern extracting the href value from every
    # <a ... href="xxx"> or <a ... href='xxx'> anchor on the page.
    anchor_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return anchor_re.findall(html)


if __name__ == '__main__':
    # First run: 'BadCrawler' is blocked by robots.txt, so it exits immediately;
    # second run: 'GoodCrawler' is allowed and crawls to depth 1 with no delay.
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')

3.测试此爬虫

我们可以将用户代理设置为BadCrawler,也就是本章前文所述的被robots.txt 屏蔽了的那个用户代理。从下面的运行结果中可以看出,爬虫果然被屏蔽了,代码启动后马上就会结束:
>>>seed_url = "http://example.webscraping.com/index"
>>>link_regex = "/(index|view)"
>>>link_crawler(seed_url, link_regex, user_agent="BadCrawler")
Blocked by robots.txt : http://example.webscraping.com/
现在,让我们使用默认的用户代理,并将最大深度设置为1,这样只有主页上的链接才会被下载:
downloading: http://example.webscraping.com
downloading: http://example.webscraping.com/index/1
downloading: http://example.webscraping.com/index/2
downloading: http://example.webscraping.com/index/0
downloading: http://example.webscraping.com/view/Barbados-20
downloading: http://example.webscraping.com/view/Bangladesh-19
downloading: http://example.webscraping.com/view/Bahrain-18
downloading: http://example.webscraping.com/view/Bahamas-17
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Azerbaijan-16
downloading: http://example.webscraping.com/view/Austria-15
downloading: http://example.webscraping.com/view/Australia-14
downloading: http://example.webscraping.com/view/Aruba-13
downloading: http://example.webscraping.com/view/Armenia-12
downloading: http://example.webscraping.com/view/Argentina-11
downloading: http://example.webscraping.com/view/Antigua-and-Barbuda-10
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Antarctica-9
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Anguilla-8
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Angola-7
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Andorra-6
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/American-Samoa-5
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Algeria-4
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Albania-3
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Aland-Islands-2
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Afghanistan-1
download error: TOO MANY REQUESTS
 
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值