Example: scraping the Maoyan movie board with a multiprocessing pool

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author   : zsc
# @FILE     : 多线程抓取猫眼电影.py
# @Time     : 2019/4/8 10:27
# @Software : PyCharm

import re
import os
import time
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url):
    '''
    Fetch the page HTML and return it.
    '''
    try:
        # Request the page
        response = requests.get(url)
        # Use the status code to tell whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print("request error:", e)
        return None


def parse_one_page(html):
    '''
    Parse the HTML, extract the useful fields, and yield one record per movie.
    '''
    # Parse with a regular expression (raw strings keep the escapes literal)
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name">'
        + r'<a.*?>(.*?)</a>.*?"star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # Match every entry on the page
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]
        }


def write_to_file(content):
    '''
    Append one record to a text file as a JSON line.
    '''
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def save_image_file(url, path):
    '''
    Save a movie poster to disk.
    '''
    ir = requests.get(url)
    if ir.status_code == 200:
        with open(path, 'wb') as f:
            f.write(ir.content)


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:  # the request failed; skip this page
        return
    # Create the covers folder if it does not exist yet
    if not os.path.exists('covers'):
        os.mkdir('covers')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        save_image_file(item['image'], 'covers/' + '%03d' % int(item['index']) + item['title'] + '.jpg')

if __name__ == '__main__':
    start = time.time()
    print("start time:", start)
    # Use a pool of worker processes to speed things up
    pool = Pool()
    # Serial version, kept for comparison:
    # for i in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:
    #     main(i)
    # 9.410544395446777 s with the serial loop
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    end = time.time()
    print("end time:", end)
    print("total time:", end - start)
    # 2.1735799312591553 s with the process pool

The scraper is quite fast: the ten page offsets are farmed out across a pool of worker processes with a single pool.map call. The data set here is small, but the difference already shows: roughly a 4x speedup (9.41 s for the serial loop vs 2.17 s for the pool). In other words, a job that takes four hours serially would finish in about one.
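As an aside, Pool also works as a context manager (Python 3.3+), which terminates the workers automatically when the block exits. A minimal sketch of the same __main__ block in that style, assuming the main(offset) function defined above:

import time
from multiprocessing import Pool

if __name__ == '__main__':
    start = time.time()
    with Pool() as pool:  # defaults to os.cpu_count() worker processes
        pool.map(main, [i * 10 for i in range(10)])  # blocks until all pages are done
    print("total time:", time.time() - start)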

Usage of from multiprocessing.dummy import Pool as ThreadPool

multiprocessing.dummy replicates the multiprocessing API but is backed by the threading module, so the same pool.map pattern runs on threads instead of processes. That suits I/O-bound work such as downloading, where workers spend most of their time waiting on the network.

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import os
import re
import requests


def spider(url):
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    html = requests.get(url, headers=headers)  # send a browser-like User-Agent
    selector = etree.HTML(html.text)  # parse the HTML into a tree for XPath queries
    content = selector.xpath('//figure[@class="post-image "]')  # select the figure tags
    for each in content:
        tmp = each.xpath('a/img/@src')  # pull the src attribute off the img tag
        pic = requests.get(tmp[0])  # fetch the image itself
        print('downloading: ' + tmp[0])
        string = re.search(r'\d+/\d+/(.*?)\.jpg', str(tmp[0])).group(1)  # extract the image name

        os.makedirs('pic2', exist_ok=True)  # create the target folder if needed
        with open('pic2/' + string + '.jpg', "wb") as f:
            f.write(pic.content)

if __name__ == '__main__':
    pool = ThreadPool(2)  # two worker threads (dual-core machine)
    pool.map(spider, ['http://hotpics.cc/page/' + str(i) for i in range(1, 11)])  # fan pages 1-10 out to the threads
    pool.close()
    pool.join()
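Because multiprocessing.dummy mirrors the multiprocessing API, switching between threads and processes is a one-line import change. A minimal sketch with a placeholder work function (not from the original scripts):

# from multiprocessing import Pool         # processes: suits CPU-bound work
from multiprocessing.dummy import Pool     # threads: suits I/O-bound work

def work(n):
    return n * n  # stand-in for a download or parse task

if __name__ == '__main__':
    with Pool(4) as pool:
        print(pool.map(work, range(10)))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

The last script below combines a process pool with an ASP.NET page, where each paginated request has to repost the hidden __VIEWSTATE and __EVENTVALIDATION form fields.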
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author   : zsc
# @FILE     : 昆山.py
# @Time     : 2019/5/23 11:24
# @Software : PyCharm
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
from multiprocessing import Pool


def index(df=None):
    if df is None:
        df = pd.DataFrame()  # avoid the mutable-default-argument pitfall
    index_url = "http://www.kshome.com.cn/Ksht/RoomInfo.aspx"
    headers = {
        'Cookie':'ASP.NET_SessionId=3vipldp1m2awav0zm14dplv2',
        'Host':'www.kshome.com.cn',
        'Origin':'http://www.kshome.com.cn',
        'Referer':'http://www.kshome.com.cn/Ksht/RoomInfo.aspx?id=20819',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    params = {"id" :20819}
    for i in range(1, 14):
        print("当前抓取页码:", i)
        index_res = requests.get(url=index_url, params=params)
        VIEWSTATE = re.findall(r'name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        EVENTVALIDATION = re.findall(r'name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        data = {
            '__EVENTTARGET': 'GridView1$_ctl43$btnGo',
            '__EVENTARGUMENT':  '',
            '__VIEWSTATE': VIEWSTATE,
            '__EVENTVALIDATION': EVENTVALIDATION,
            'txtCs': '',
            'txtSh': '',
            'GridView1:_ctl43:txtNewPageIndex': i,
        }
        response = requests.post(url=index_url, headers=headers, params=params, data=data)
        html = etree.HTML(response.text)
        a = html.xpath("//table[@id='GridView1']//tr/td[1]/text()")
        b = html.xpath("//table[@id='GridView1']//tr/td[2]/text()")
        c = html.xpath("//table[@id='GridView1']//tr/td[3]/text()")
        d = html.xpath("//table[@id='GridView1']//tr/td[4]/text()")
        e = html.xpath("//table[@id='GridView1']//tr/td[5]/text()")
        a = a[:len(b)]  # trim the first column so every column has the same length
        df1 = pd.DataFrame([a, b, c, d, e]).T
        df = pd.concat([df, df1], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    print("___________________________________________")
    df.drop_duplicates(inplace=True)
    return df


def main(number):
    print("worker:", number)
    # NOTE: all workers read and rewrite the same all3.xlsx, so concurrent runs
    # can race and lose rows; writing one file per worker would be safer.
    if os.path.exists("./all3.xlsx"):
        df = pd.read_excel("./all3.xlsx")
        df = index(df)
    else:
        df = index()
    print(len(df))
    df.to_excel("all3.xlsx")


if __name__ == '__main__':
    start = time.time()
    print("start time:", start)
    pool = Pool()
    pool.map(main, list(range(10)))
    pool.close()
    pool.join()
    end = time.time()
    print("end time:", end)
    print("total time:", end - start)


 
