python多线程爬取网页（多线程爬虫示例）

#-*- encoding:utf8 -*-

'''

Created on 2018年12月25日

@author: Administrator

'''

from multiprocessing.dummy import Pool as pl

import csv

import requests

from lxml import etree

def spider(url):
    """Fetch *url* and return its decoded JSON payload.

    Sends a desktop-browser User-Agent so the 36kr API does not reject
    the request as a bot.

    :param url: API endpoint to GET.
    :return: the response body parsed as JSON (dict).
    :raises requests.Timeout: if the server does not answer in time.
    :raises ValueError: if the body is not valid JSON.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
    }
    # A timeout keeps a worker thread from hanging forever on a stalled
    # connection; the original call had none.
    r = requests.get(url=url, headers=header, timeout=10)
    return r.json()

def spider_detail(url):
    """Download one article, strip its HTML, and save it to disk.

    A malformed payload (missing title/content) or an unwritable file
    name skips that single article instead of aborting the whole crawl.

    :param url: article detail-API URL (see get_all_urls).
    """
    resp = spider(url)
    data = resp.get('data') or {}
    title = data.get('title')
    print(title)
    content = data.get('content')
    try:
        # Drop characters that are illegal in Windows file names.
        title_clear = title.replace('|', '').replace('?', '')
        # NOTE(review): the replace() arguments were corrupted in this copy
        # of the file (raw newlines inside the string literals). '</p>' ->
        # blank line and '<p>' -> '' is the most plausible original intent
        # (normalize paragraph breaks before stripping markup) -- confirm
        # against the original source.
        content_clear = content.replace('</p>', '\n\n').replace('<p>', '')
        # Parse the remaining HTML and keep only its text content.
        sel = etree.HTML(content_clear)
        content_clear = sel.xpath('string(//*)')
        artical_write(title_clear, content_clear)
        print(title_clear)
    except (AttributeError, OSError, TypeError) as exc:
        # Was a bare `except: pass`, which hid every bug. Report and move on.
        print('skip', url, exc)

def get_all_urls(page_number):
    """Yield the detail-API URL of every article on listing pages 1..page_number.

    :param page_number: number of 20-item listing pages to walk.
    :yields: article URLs of the form https://36kr.com/api/post/<id>/next
    """
    listing_url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page='
    for page in range(1, page_number + 1):
        listing = spider(listing_url + str(page))
        # Each listing page carries its articles under data.items.
        for item in listing.get('data').get('items'):
            yield 'https://36kr.com/api/post/' + str(item.get('id')) + '/next'

def artical_write(title, content, directory='d:/spider_data/11.11/'):
    """Save *content* as '<directory><title>.txt', UTF-8 encoded.

    :param title: file name stem (must already be sanitized for the OS).
    :param content: text to write.
    :param directory: target directory (trailing slash expected). Defaults
        to the previously hard-coded path for backward compatibility.
    :raises OSError: if the directory does not exist or is not writable.
    """
    with open(directory + title + '.txt', 'wt', encoding='utf-8') as f:
        f.write(content)

if __name__ == '__main__':
    # multiprocessing.dummy.Pool is a *thread* pool -- appropriate here
    # because the work is network I/O bound. 4 workers.
    pool = pl(4)

    # Collect every article URL up front (100 listing pages).
    # list() replaces the original manual append loop.
    all_url = list(get_all_urls(100))

    # Crawl article details concurrently.
    pool.map(spider_detail, all_url)
    pool.close()
    pool.join()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值