python写爬虫4-多线程爬虫(采集58出租房信息)

最新推荐文章于 2024-10-07 13:56:59 发布

皓阳当空

最新推荐文章于 2024-10-07 13:56:59 发布

阅读量4.5k

点赞数 1

分类专栏：爬虫　文章标签：python 多线程爬虫

本文链接：https://blog.csdn.net/apple9005/article/details/54971151

版权

爬虫专栏收录该内容

9 篇文章 0 订阅

订阅专栏

python写爬虫4-多线程爬虫(采集58出租房信息)

本文代码是在【python写爬虫3-MongoDB数据缓存(采集58出租房信息)】http://blog.csdn.net/apple9005/article/details/54967916博文的基础上编写

运行【python写爬虫3-MongoDB数据缓存(采集58出租房信息)】中的代码爬取信息，你会发现，爬取一个列表页的详细数据（约150条）需要较长时间。可见串行爬虫的爬取效率并不高，如何缩短爬取数据所花费的时间呢？本文使用多线程来缩短爬取时间。

1.先来一个Python的线程小示例

直接使用python的内置模块threading

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import threading
from time import ctime, sleep


def music(func):
    """Print a timestamped "listening" line twice, pausing one second after each.

    ``func`` is the title text substituted into the message; it is only
    formatted with ``%s`` and never called.
    """
    remaining = 2
    while remaining > 0:
        line = 'I was listening to %s.%s' % (func, ctime())
        print(line)
        sleep(1)
        remaining -= 1

def move(func):
    """Print a timestamped "at the <title>" line twice, sleeping five seconds after each.

    ``func`` is the title text substituted into the message; it is only
    formatted with ``%s`` and never called.
    """
    template = "I was at the %s!%s"
    for _ in (0, 1):
        print(template % (func, ctime()))
        sleep(5)

# Both demo threads are created at import time; they are started only when
# this file is executed as a script.
t1 = threading.Thread(target=music, args=(u'爱情买卖',))
t2 = threading.Thread(target=move, args=(u'阿凡达',))
threads = [t1, t2]

if __name__ == '__main__':

    for worker in threads:
        # worker.setDaemon(True)  # uncomment to mark the thread as a daemon thread
        worker.start()

    # Printed right after the threads are started; nothing above joins them.
    print('All over %s' % ctime())

未设为守护线程，执行结果如下：
这里写图片描述
设为守护线程，执行结果如下：

2.多线程爬虫

#! /usr/bin/env python
# -*- coding:utf-8 -*-

import urllib2
import lxml.html
import time
from lxml.cssselect import CSSSelector
from MongoCache import MongoCache
import threading


def download(url, user_agent='Google', num_retries=2):
    """Download a page and return its body.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Number of further attempts allowed when the server
            answers with a 5xx status code.

    Returns:
        The response body as returned by ``read()``, or None when the
        request failed and no retry succeeded.
    """
    print('Downloading: %s' % url)

    # Send the configured user agent with the request.
    request = urllib2.Request(url, headers={'User-agent': user_agent})
    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()
    except urllib2.URLError as e:
        print('Downloading error: %s' % (e.reason,))

        # Retry only on server-side (5xx) errors.  The user agent and the
        # decremented counter are forwarded explicitly; passing the counter
        # as the second positional argument would overwrite ``user_agent``
        # and leave the retry budget untouched.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download(url, user_agent, num_retries - 1)
    return None


def get_data(url):
    """Print the fields scraped from one detail page.

    The page HTML is taken from the ``MongoCache`` entry for ``url`` when
    that entry is truthy.  Otherwise the page is downloaded and, if the
    download produced content, stored in the cache before parsing.

    Args:
        url: Address of the detail page.

    Returns:
        None.  Every extracted value is printed.
    """
    cache = MongoCache()

    # Look the page up once and reuse the result instead of querying the
    # cache a second time in the hit branch.
    html_text_detail = cache[url]
    if html_text_detail:
        print('Exists: %s' % url)
    else:
        html_text_detail = download(url)
        if not html_text_detail:
            # Nothing to parse: stop here rather than hand None or an empty
            # body to the HTML parser.
            print('None: %s' % url)
            return
        cache[url] = html_text_detail

    try:
        # Extract each field from the parsed document.
        tree = lxml.html.fromstring(html_text_detail)

        house_title = CSSSelector('div.main-wrap > div.house-title > h1')
        house_pay_way1 = CSSSelector('div.house-pay-way > span:nth-child(1)')
        house_pay_way2 = CSSSelector('div.house-pay-way > span:nth-child(2)')
        print(house_title(tree)[0].text_content())
        print('%s|%s' % (house_pay_way1(tree)[0].text_content(),
                         house_pay_way2(tree)[0].text_content()))

        # Seven list items, each with two spans, addressed via nth-child.
        for i in range(7):
            for j in range(2):
                css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (i+1, j+1)
                house_info = CSSSelector(css)
                print(house_info(tree)[0].text_content().replace(' ', ''))

    except TypeError as e:
        # Raised when the cached/downloaded value is not parseable text.
        print('HTML文本发生错误：%s' % e)
    except IndexError as e:
        # A selector matched nothing, so indexing [0] failed.
        print('获取详细数据发生错误：%s' % e)


def get_url(html):
    """Collect the detail-page links found on a listing page.

    Args:
        html: Markup of the listing page.  May be None or empty, e.g. when
            the download of the listing page failed.

    Returns:
        A list of the unique ``href`` values of the matched anchors, in
        document order.  Empty when ``html`` is falsy.
    """
    if not html:
        # A failed download yields None; there is nothing to parse.
        return []

    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')

    seen = set()  # constant-time duplicate check; url_list keeps the order
    url_list = []
    for anchor in sel(tree):
        href = anchor.get('href')
        # Skip anchors without an href as well as links already collected.
        if href is None or href in seen:
            continue
        seen.add(href)
        url_list.append(href)
    return url_list


if __name__ == '__main__':
    # Fetch the listing page, then crawl every detail link in its own thread.
    url_index = 'http://bj.58.com/chuzu/'
    url_list = get_url(download(url_index))

    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        print('-------------------Thread Name: %s----------------' % thr.name)
        # Wait 2s before starting the next thread; per the original author's
        # note, the run finishes very quickly when this delay is removed.
        time.sleep(2)