#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : zsc
# @FILE : 多线程抓取猫眼电影.py
# @Time : 2019/4/8 10:27
# @Software : PyCharm
#-*- coding: utf-8 -*-
import re
import os
import time
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch *url* and return its HTML text.

    :param url: page URL to download
    :return: response body as text on HTTP 200, otherwise None
             (also None on any requests-level failure)
    """
    try:
        # timeout so a pool worker cannot hang forever on a dead host
        response = requests.get(url, timeout=10)
        # Only a 200 status counts as success.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print("打印错误信息:", e)
        return None
def parse_one_page(html):
    """Parse one Maoyan board page and yield one dict per movie.

    :param html: raw HTML of the board page
    :yield: dict with keys index, image, title, actor, time, score
            (all values are strings)
    """
    # Raw strings: the original non-raw '\d' is an invalid escape
    # sequence and triggers a SyntaxWarning on modern Python.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name">'
        r'<a.*?>(.*?)</a>.*?"star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # strip the "主演:" prefix (3 chars) from the actor field
            'actor': item[3].strip()[3:],
            # strip the "上映时间:" prefix (5 chars) from the release time
            'time': item[4].strip()[5:],
            # score is split into integer part ("9.") and fraction ("6")
            'score': item[5] + item[6],
        }
def write_to_file(content):
    """Append *content* as one JSON line to result.txt (UTF-8).

    :param content: JSON-serializable object (one movie dict)
    """
    # ensure_ascii=False keeps Chinese characters readable in the file.
    # The with-block closes the file; the original's explicit f.close()
    # inside the block was redundant.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def save_image_file(url, path):
    """Download the image at *url* and write its bytes to *path*.

    Does nothing when the response status is not 200.
    """
    # timeout so a pool worker cannot hang forever on a dead host
    ir = requests.get(url, timeout=10)
    if ir.status_code == 200:
        # Binary mode; the with-block closes the file — the original's
        # explicit f.close() inside the block was redundant.
        with open(path, 'wb') as f:
            f.write(ir.content)
def main(offset):
    """Scrape one page of the Maoyan top-100 board: print each movie,
    persist it to result.txt, and download its cover image."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(page_url)
    # Create the cover folder on first use.
    if not os.path.exists('covers'):
        os.mkdir('covers')
    for movie in parse_one_page(html):
        print(movie)
        write_to_file(movie)
        cover_path = 'covers/' + '%03d' % int(movie['index']) + movie['title'] + '.jpg'
        save_image_file(movie['image'], cover_path)
if __name__ == '__main__':
    start = time.time()
    print("程序开始时间:", start)
    # Process pool for speed: a serial loop over the same ten offsets
    # took ~9.41s; the pool finishes in ~2.17s (~4x faster).
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    # Release the worker processes (consistent with the other scripts
    # in this collection, which close and join their pools).
    pool.close()
    pool.join()
    end = time.time()
    print("程序结束时间:", end)
    print("总时间为: end - start", end - start)
这个爬取的效率还是相当高的:开了 10 个进程一起抓取,直接使用 map 函数分发任务。虽然这里的数据量并不大,但也能比较出来,速度相差约 4 倍。也就是说,原本需要四个小时的任务,一个小时就可以完成。
下面演示 from multiprocessing.dummy import Pool as ThreadPool(线程池)的用法:
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import re
import sys
import time
def spider(url):
    """Download every post image found on one listing page into pic2\\.

    :param url: listing-page URL to scrape
    """
    print(url)
    # Browser User-Agent avoids trivial bot blocking.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=10)
    # Build an element tree so xpath queries work.
    selector = etree.HTML(html.text)
    # Note the trailing space in the class value — it must match exactly.
    content = selector.xpath('//figure[@class="post-image "]')
    for each in content:
        tmp = each.xpath('a/img/@src')  # src attribute of the <img>
        if not tmp:
            continue  # figure without an image link
        pic = requests.get(tmp[0], timeout=10)
        print('downloading: ' + tmp[0])
        # BUG FIX: the original pattern ended in r'\\.jpg', which matches
        # a literal backslash before "jpg" — that never occurs in a URL,
        # so re.search returned None and .group(1) raised AttributeError.
        match = re.search(r'\d+/\d+/(.*?)\.jpg', str(tmp[0]))
        if match is None:
            continue  # URL doesn't follow the expected .../YYYY/MM/name.jpg shape
        with open('pic2\\' + match.group(1) + '.jpg', "wb") as f:
            f.write(pic.content)
if __name__ == '__main__':
    # Fan the ten listing pages out across two worker threads.
    page_urls = ['http://hotpics.cc/page/' + str(n) for n in range(1, 11)]
    pool = ThreadPool(2)  # two workers (dual-core machine)
    pool.map(spider, page_urls)
    pool.close()
    pool.join()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : zsc
# @FILE : 昆山.py
# @Time : 2019/5/23 11:24
# @Software : PyCharm
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
from multiprocessing import Pool
def index(df=None):
    """Scrape pages 1-13 of the Kunshan housing room-info table.

    The target is an ASP.NET WebForms page: each page change is a POST
    that must echo back the __VIEWSTATE / __EVENTVALIDATION tokens
    obtained from a fresh GET.

    :param df: optional DataFrame of previously scraped rows to extend;
               defaults to an empty frame (None sentinel instead of the
               original mutable default argument)
    :return: deduplicated DataFrame, columns 0-4 = table columns 1-5
    """
    if df is None:
        df = pd.DataFrame()
    index_url = "http://www.kshome.com.cn/Ksht/RoomInfo.aspx"
    headers = {
        'Cookie': 'ASP.NET_SessionId=3vipldp1m2awav0zm14dplv2',
        'Host': 'www.kshome.com.cn',
        'Origin': 'http://www.kshome.com.cn',
        'Referer': 'http://www.kshome.com.cn/Ksht/RoomInfo.aspx?id=20819',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    params = {"id": 20819}
    frames = [df]
    for page in range(1, 14):
        print("当前抓取页码:", page)
        # Fresh GET for the state tokens (now also sends the same
        # headers as the POST below, which the original omitted).
        index_res = requests.get(url=index_url, headers=headers, params=params, timeout=30)
        viewstate = re.findall(r'name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        eventvalidation = re.findall(r'name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        data = {
            '__EVENTTARGET': 'GridView1$_ctl43$btnGo',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': viewstate,
            '__EVENTVALIDATION': eventvalidation,
            'txtCs': '',
            'txtSh': '',
            'GridView1:_ctl43:txtNewPageIndex': page,
        }
        response = requests.post(url=index_url, headers=headers, params=params, data=data, timeout=30)
        html = etree.HTML(response.text)
        # One list of cell texts per table column (td[1]..td[5]).
        cols = [html.xpath("//table[@id='GridView1']//tr/td[%d]/text()" % n)
                for n in range(1, 6)]
        # Column 1 picks up extra (pager/header) cells; trim it to the
        # data-row count of column 2, as the original did.
        cols[0] = cols[0][:len(cols[1])]
        frames.append(pd.DataFrame(cols).T)
        print("___________________________________________")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(frames, ignore_index=True)
    df.drop_duplicates(inplace=True)
    return df
def main(number):
    """Worker entry point: resume from all3.xlsx when it exists,
    scrape the table, and write the result back out."""
    print("AAAAAAAAAAAAAAAAAAAAAAAAAAAA", number)
    # Resume from the previously saved spreadsheet if one is present.
    if os.path.exists("./all3.xlsx"):
        df = index(pd.read_excel("./all3.xlsx"))
    else:
        df = index()
    print(len(df))
    df.to_excel("all3.xlsx")
if __name__ == '__main__':
    start = time.time()
    print("程序开始时间:", start)
    # Fan the scrape out over a pool of worker processes.
    pool = Pool()
    pool.map(main, list(range(10)))
    print("1000000000000")
    pool.close()
    pool.join()
    end = time.time()
    print("程序结束时间:", end)
    print("总时间为: end - start", end - start)