为链接爬虫添加缓存支持

最新推荐文章于 2023-08-26 14:20:33 发布

kimiYangfly

最新推荐文章于 2023-08-26 14:20:33 发布

阅读量647

点赞数 1

本文链接：https://blog.csdn.net/ykm18811712927/article/details/72582901

版权

#-*- coding:UTF-8 -*-
#原来创建对象时或者是调用类以外的方法时提示没有定义是因为这些类或方法的位置不应该放在主函数后面，而应该放在主函数前面
import urlparse
import urllib2
import random
import time
from datetime import datetime, timedelta
import socket
import robotparser
import csv
import re
import lxml.html
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60

def link_crawler(seed_url, link_regex=None, max_depth=-1, max_urls=-1, user_agent='wswp',  scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled 双向队列里面存储url
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)#获取robots.txt
    D = Downloader(DEFAULT_DELAY, DEFAULT_AGENT, None, DEFAULT_RETRIES, DEFAULT_TIMEOUT, None, None)
    while crawl_queue:
        url = crawl_queue.pop()#移除列表中的元素，并且返回该元素的值
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):#确定指定的用户代理是否允许访问网页
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link) #返回绝对链接
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))#绝对链接
    rp.read()
    return rp

def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates urldefrag(url)将url分解成去掉fragment的新url和去掉的fragment的二元组
    return urlparse.urljoin(seed_url, link)#绝对链接

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    #将url分解成部件的6元组
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    #re.compile()函数将正则表达式的字符串形式编译为Pattern实例，然后使用Pattern实例处理文本并获得匹配结果（一个Match实例），最后使用Match实例获得信息，进行其他的操作。
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            # result was not loaded from cache so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self._get(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name',
          'phone', 'postal_code_format', 'postal_code_regex', 'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
                self.writer.writerow(row)

class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', scrape_callback=ScrapeCallback())
C:\Anaconda\python.exe C:/Users/Administrator/PythonWorkSpace/python04.py
Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/index/1
Downloading: http://example.webscraping.com/index/2

Process finished with exit code 1

kimiYangfly

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
为链接爬虫添加缓存支持

#-*- coding:UTF-8 -*-#原来创建对象时或者是调用类以外的方法时提示没有定义是因为这些类或方法的位置不应该放在主函数后面，而应该放在主函数前面import urlparseimport urllib2import randomimport timefrom datetime import datetime, timedeltaimport socketimport
复制链接

扫一扫