tags: python, yield, csv, re, urllib
PS: except for fake_useragent (a third-party package, `pip install fake-useragent`), all modules used below are from the standard library and need no installation.
I recently picked up the Scrapy framework and kept running into yield, so I decided to use this little example to review yield and yield from, and to get a taste of the legendary pain of parsing data with regular expressions along the way. To make the fruits of the labor easier to enjoy, the scraped data is stored in a CSV file. I found that a few movies' data could not be extracted; oddly, when I took just the chunks of page source that failed to parse and ran them through the regex on their own, no problem showed up, so the regex part still needs some reworking.
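Before the crawler itself, here is a minimal sketch (the function names are made up for illustration) of what `yield from` buys you: it delegates to a sub-generator, so the outer generator hands through every value the inner one produces without an explicit loop.

def inner():
    yield 1
    yield 2

def outer_manual():
    # Manual delegation: loop over the sub-generator and re-yield each item
    for item in inner():
        yield item

def outer_delegated():
    # Equivalent, but shorter: delegate straight to the sub-generator
    yield from inner()

print(list(outer_manual()))     # [1, 2]
print(list(outer_delegated()))  # [1, 2]

Now, on to the crawler itself.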
import csv
import re
import time
import urllib.request
from fake_useragent import UserAgent
# import os, http.cookiejar
# Save the scraped data into a CSV file
def save2csv(generator, filepath, field_names):
    with open(filepath, 'w', encoding="utf-8", newline='') as fp:
        # Write rows as dicts; personally I recommend DictWriter for this
        writer = csv.DictWriter(fp, fieldnames=field_names)
        # Write the header row
        writer.writeheader()
        while True:
            try:
                # Since request_url uses `yield from`, each next() call now
                # produces a single movie dict, so writerow() is the right
                # call here (writerows() would try to treat the dict's keys
                # as rows and raise on every page)
                msg = next(generator)
                writer.writerow(msg)
            except StopIteration:
                # The generator is exhausted: no more data to fetch
                print("StopIteration. data is none.")
                break
            except Exception as e:
                print(e)
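# Note: the explicit next()/StopIteration loop above is there to make the
# generator mechanics visible; the idiomatic equivalent is simply
#     for msg in generator:
#         writer.writerow(msg)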
# Walk through the ten list pages (a generator function)
def request_url(start_url, headers):
    for i in range(10):
        url = start_url.format(i * 10)
        print(url)
        request = urllib.request.Request(url, headers=headers)
        # No need for add_header() anymore; headers are passed in directly
        # request.add_header("User-Agent", headers["User-Agent"])
        # request.add_header("Cookie", headers["Cookie"])
        response = urllib.request.urlopen(request)
        # The old version collected a whole page into a list and yielded it:
        # msg_list = pick_movie_msgs(response)
        # print(len(msg_list), msg_list)
        # if msg_list:
        #     yield msg_list
        # With `yield from`, this generator delegates to pick_movie_msgs()
        # and hands each movie dict through one at a time
        yield from pick_movie_msgs(response)
        # Be polite: don't hammer the server, and don't get banned
        time.sleep(3)
# Precompile the regex; this is where non-greedy `.*?` really shines
_rule = re.compile(r'<dd>.*?>(\d+)</i>'  # ranking number
                   + r'.*?src="https(.*?)".*?>'  # cover url **
                   + r'.*?title="(.*?)"'  # movie title
                   + r'.*?star">(.*?)</p>'  # starring
                   + r'.*?releasetime">.*?(\d+-\d+-\d+)'  # release date
                   + r'.*?integer">(\d+\.).*?fraction">(\d).*?</dd>',  # score (integer + fraction parts)
                   flags=re.S)
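# Aside on `.*` vs `.*?` (a made-up snippet, not from the Maoyan page):
# greedy `.*` grabs as much as it possibly can, so
#     re.findall(r'<b>(.*)</b>', '<b>a</b><b>b</b>')   -> ['a</b><b>b']
# while lazy `.*?` stops at the earliest possible match:
#     re.findall(r'<b>(.*?)</b>', '<b>a</b><b>b</b>')  -> ['a', 'b']
# re.S additionally lets `.` match newlines, which is what allows one
# pattern to walk across a whole multi-line <dd>...</dd> block.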
# Pull the useful fields out of a page
def pick_movie_msgs(response):
    page_source_data = response.read()
    page_source = page_source_data.decode("utf-8")
    # print(page_source)
    data_list = _rule.findall(page_source)
    # print(data_list)
    # msg_list = []
    for data in data_list:
        # each match is a tuple of the captured groups
        if data:
            msg = {
                "order_number": data[0].strip(),
                # re-attach the "https" the pattern consumed
                "img_url": "".join(("https", data[1].strip())),
                "title": data[2].strip(),
                # keep only what follows the ":", dropping the leading label
                "star": data[3].strip().split(":")[1],
                "release_time": data[4].strip(),
                # glue the integer and fraction parts back together, e.g. "9.6"
                "score": "".join((data[5].strip(), data[6].strip())),
            }
            # The pre-`yield from` version collected into a list instead:
            # msg_list.append(msg)
            # return msg_list
            yield msg
# Program entry point
def main():
    # Maoyan Top 100 movie chart
    start_url = "https://maoyan.com/board/4?offset={}"
    headers = {
        "User-Agent": UserAgent().random,
        "Cookie": "__mta=188595863.1563115601062.1563116145904.1563361495934.20; uuid_n_v=v1; uuid=3012C9D0A64611E9AB785F96570DC49513513EE76D2F46EDBCA08C92F92471ED; _lxsdk_cuid=16bf0f33aa941-0e8e80acdaf83f-e343166-e1000-16bf0f33aaac8; _lxsdk=3012C9D0A64611E9AB785F96570DC49513513EE76D2F46EDBCA08C92F92471ED; _csrf=f6517f9b55f5215e1d80ab9ec8c7885968e50a0d83dd8bac15126cdd0969810c; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=188595863.1563115601062.1563116145904.1563361420789.20; _lxsdk_s=16bff99cb1a-d00-c10-473%7C%7C8",
    }
    filepath = './maoyan.csv'
    data_head = ["order_number", "img_url",
                 "title", "star", "release_time", "score"]
    # Build the generator; nothing is fetched until save2csv consumes it
    generator = request_url(start_url, headers)
    print(type(generator))
    save2csv(generator, filepath, data_head)

# Start crawling
if __name__ == "__main__":
    main()
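As for the few movies whose data goes missing: a quick way to narrow it down is to count how many <dd> entries a page actually contains versus how many tuples the regex extracts, and dump the source of any mismatching page for inspection. A minimal sketch (debug_page and the dump path are made up for illustration):

def debug_page(page_source):
    # Compare raw <dd> entries against what the regex extracts; a mismatch
    # points at the entry whose HTML breaks the pattern's assumptions
    dd_count = page_source.count("<dd>")
    matches = _rule.findall(page_source)
    print("dd blocks: {}, regex matches: {}".format(dd_count, len(matches)))
    if len(matches) < dd_count:
        # Dump the page so the offending entry can be eyeballed by hand
        with open("./page_dump.html", "w", encoding="utf-8") as fp:
            fp.write(page_source)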