#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : zsc
# @FILE : 多线程抓取猫眼电影.py
# @Time : 2019/4/8 10:27
# @Software : PyCharm
#-*- coding: utf-8 -*-
import re
import os
import time
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch *url* and return its HTML text.

    :param url: page URL to download
    :return: response body as text on HTTP 200, otherwise None
             (also None on any requests-level failure)
    """
    try:
        # timeout so a pool worker cannot hang forever on a dead host
        response = requests.get(url, timeout=10)
        # Only a 200 status counts as success.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print("打印错误信息:", e)
        return None
def parse_one_page(html):
    """Parse one Maoyan board page and yield one dict per movie.

    :param html: raw HTML of the board page
    :yield: dict with keys index, image, title, actor, time, score
            (all values are strings)
    """
    # Raw strings: the original non-raw '\d' is an invalid escape
    # sequence and triggers a SyntaxWarning on modern Python.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name">'
        r'<a.*?>(.*?)</a>.*?"star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # strip the "主演:" prefix (3 chars) from the actor field
            'actor': item[3].strip()[3:],
            # strip the "上映时间:" prefix (5 chars) from the release time
            'time': item[4].strip()[5:],
            # score is split into integer part ("9.") and fraction ("6")
            'score': item[5] + item[6],
        }
def write_to_file(content):
    """Append *content* as one JSON line to result.txt (UTF-8).

    :param content: JSON-serializable object (one movie dict)
    """
    # ensure_ascii=False keeps Chinese characters readable in the file.
    # The with-block closes the file; the original's explicit f.close()
    # inside the block was redundant.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def save_image_file(url, path):
    """Download the image at *url* and write its bytes to *path*.

    Does nothing when the response status is not 200.
    """
    # timeout so a pool worker cannot hang forever on a dead host
    ir = requests.get(url, timeout=10)
    if ir.status_code == 200:
        # Binary mode; the with-block closes the file — the original's
        # explicit f.close() inside the block was redundant.
        with open(path, 'wb') as f:
            f.write(ir.content)
def main(offset):
    """Scrape one page of the Maoyan top-100 board: print each movie,
    persist it to result.txt, and download its cover image."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(page_url)
    # Create the cover folder on first use.
    if not os.path.exists('covers'):
        os.mkdir('covers')
    for movie in parse_one_page(html):
        print(movie)
        write_to_file(movie)
        cover_path = 'covers/' + '%03d' % int(movie['index']) + movie['title'] + '.jpg'
        save_image_file(movie['image'], cover_path)
if __name__ == '__main__':
    start = time.time()
    print("程序开始时间:", start)
    # Process pool for speed: a serial loop over the same ten offsets
    # took ~9.41s; the pool finishes in ~2.17s (~4x faster).
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    # Release the worker processes (consistent with the other scripts
    # in this collection, which close and join their pools).
    pool.close()
    pool.join()
    end = time.time()
    print("程序结束时间:", end)
    print("总时间为: end - start", end - start)
这个爬取的效率还是相当高的:开了 10 个进程一起抓取,直接使用 map 函数分发任务。虽然这里的数据量并不大,但也能比较出来,速度相差约 4 倍。也就是说,原本需要四个小时的任务,一个小时就可以完成。
下面演示 from multiprocessing.dummy import Pool as ThreadPool(线程池)的用法:
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import re
import sys
import time
def spider(url):
    """Download every post image found on one listing page into pic2\\.

    :param url: listing-page URL to scrape
    """
    print(url)
    # Browser User-Agent avoids trivial bot blocking.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=10)
    # Build an element tree so xpath queries work.
    selector = etree.HTML(html.text)
    # Note the trailing space in the class value — it must match exactly.
    content = selector.xpath('//figure[@class="post-image "]')
    for each in content:
        tmp = each.xpath('a/img/@src')  # src attribute of the <img>
        if not tmp:
            continue  # figure without an image link
        pic = requests.get(tmp[0], timeout=10)
        print('downloading: ' + tmp[0])
        # BUG FIX: the original pattern ended in r'\\.jpg', which matches
        # a literal backslash before "jpg" — that never occurs in a URL,
        # so re.search returned None and .group(1) raised AttributeError.
        match = re.search(r'\d+/\d+/(.*?)\.jpg', str(tmp[0]))
        if match is None:
            continue  # URL doesn't follow the expected .../YYYY/MM/name.jpg shape
        with open('pic2\\' + match.group(1) + '.jpg', "wb") as f:
            f.write(pic.content)
if __name__ == '__main__':
    # Fan the ten listing pages out across two worker threads.
    page_urls = ['http://hotpics.cc/page/' + str(n) for n in range(1, 11)]
    pool = ThreadPool(2)  # two workers (dual-core machine)
    pool.map(spider, page_urls)
    pool.close()
    pool.join()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : zsc
# @FILE : 昆山.py
# @Time : 2019/5/23 11:24
# @Software : PyCharm
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
from multiprocessing import Pool
def index(df=None):
    """Scrape pages 1-13 of the Kunshan housing room-info table.

    The target is an ASP.NET WebForms page: each page change is a POST
    that must echo back the __VIEWSTATE / __EVENTVALIDATION tokens
    obtained from a fresh GET.

    :param df: optional DataFrame of previously scraped rows to extend;
               defaults to an empty frame (None sentinel instead of the
               original mutable default argument)
    :return: deduplicated DataFrame, columns 0-4 = table columns 1-5
    """
    if df is None:
        df = pd.DataFrame()
    index_url = "http://www.kshome.com.cn/Ksht/RoomInfo.aspx"
    headers = {
        'Cookie': 'ASP.NET_SessionId=3vipldp1m2awav0zm14dplv2',
        'Host': 'www.kshome.com.cn',
        'Origin': 'http://www.kshome.com.cn',
        'Referer': 'http://www.kshome.com.cn/Ksht/RoomInfo.aspx?id=20819',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    params = {"id": 20819}
    frames = [df]
    for page in range(1, 14):
        print("当前抓取页码:", page)
        # Fresh GET for the state tokens (now also sends the same
        # headers as the POST below, which the original omitted).
        index_res = requests.get(url=index_url, headers=headers, params=params, timeout=30)
        viewstate = re.findall(r'name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        eventvalidation = re.findall(r'name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />', index_res.text, re.DOTALL)[0]
        data = {
            '__EVENTTARGET': 'GridView1$_ctl43$btnGo',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': viewstate,
            '__EVENTVALIDATION': eventvalidation,
            'txtCs': '',
            'txtSh': '',
            'GridView1:_ctl43:txtNewPageIndex': page,
        }
        response = requests.post(url=index_url, headers=headers, params=params, data=data, timeout=30)
        html = etree.HTML(response.text)
        # One list of cell texts per table column (td[1]..td[5]).
        cols = [html.xpath("//table[@id='GridView1']//tr/td[%d]/text()" % n)
                for n in range(1, 6)]
        # Column 1 picks up extra (pager/header) cells; trim it to the
        # data-row count of column 2, as the original did.
        cols[0] = cols[0][:len(cols[1])]
        frames.append(pd.DataFrame(cols).T)
        print("___________________________________________")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(frames, ignore_index=True)
    df.drop_duplicates(inplace=True)
    return df
def main(number):
    """Worker entry point: resume from all3.xlsx when it exists,
    scrape the table, and write the result back out."""
    print("AAAAAAAAAAAAAAAAAAAAAAAAAAAA", number)
    # Resume from the previously saved spreadsheet if one is present.
    if os.path.exists("./all3.xlsx"):
        df = index(pd.read_excel("./all3.xlsx"))
    else:
        df = index()
    print(len(df))
    df.to_excel("all3.xlsx")
if __name__ == '__main__':
    start = time.time()
    print("程序开始时间:", start)
    # Fan the scrape out over a pool of worker processes.
    pool = Pool()
    pool.map(main, list(range(10)))
    print("1000000000000")
    pool.close()
    pool.join()
    end = time.time()
    print("程序结束时间:", end)
    print("总时间为: end - start", end - start)