Scraping the Movie Top 100
Scrape every movie entry from the TOP 100 board in the board section of Maoyan Movies (all 10 pages), with the required fields rank, poster image, movie title, starring actors, release date, and score, and write the results to a file (reference URL: http://maoyan.com/board/4).
Open https://maoyan.com/board/4?offset=0 in a browser and study the structure of each list item; a sample is shown below. Note that this structure may change at any time, so the parser has to be kept in sync with the live page (see the structure check sketched after the markup).
<div class="main">
<p class="update-time">2019-12-31<span class="has-fresh-text">已更新</span></p>
<p class="board-content">榜单规则:将猫眼电影库中的经典影片,按照评分和评分人数从高到低综合排序取前100名,每天上午10点更新。相关数据来源于“猫眼电影库”。</p>
<dl class="board-wrapper">
<dd> <i class="board-index board-index-1">1</i> <a href="/films/1203" title="霸王别姬" class="image-link" data-act="boarditem-click" data-val="{movieId:1203}"> <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" /> <img data-src="https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@160w_220h_1e_1c" alt="霸王别姬" class="board-img" /> </a>
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/1203" title="霸王别姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王别姬</a></p>
<p class="star"> 主演:张国荣,张丰毅,巩俐 </p>
<p class="releasetime">上映时间:1993-07-26</p>
</div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</div>
</div>
</div>
</dd>
</dl>
</div>
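Because the markup can change, it is worth probing the page before trusting the parser. A minimal sketch of such a check, assuming the request is not intercepted by the human-verification page discussed below (the anchor strings are the class names the regular expression in the full program relies on):

from urllib import request

# Fetch one board page and verify the class names the parser relies on
# are still present in the markup.
req = request.Request('https://maoyan.com/board/4?offset=0',
                      headers={'User-Agent': 'Mozilla/5.0'})
html = request.urlopen(req).read().decode('utf-8')
for anchor in ('board-wrapper', 'board-index', 'board-img', 'releasetime'):
    if anchor not in html:
        print('page structure changed, missing:', anchor)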
To defend against scrapers, websites commonly add a human-verification step, and Maoyan has added one as well. A simple workaround is to visit the page manually once in a browser and then let the crawler run automatically; alternatively, use a sliding-captcha recognizer such as the GeeTest slider solver (https://github.com/GYT0313/SlidingCheck).
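One way to apply the "visit manually first" workaround is to reuse the browser session that passed the check: copy the Cookie header from the browser's developer tools and attach it to every request. A minimal sketch; the cookie value below is a placeholder you must replace with your own:

from urllib import request

# Placeholder: paste the Cookie header from a browser session that has
# already passed Maoyan's verification page.
COOKIE = 'uuid_n_v=...; _lxsdk_cuid=...'

req = request.Request('https://maoyan.com/board/4?offset=0', headers={
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Cookie': COOKIE,   # reuse the verified browser session
})
html = request.urlopen(req).read().decode('utf-8')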
from urllib import request, error
import re, time, os, json
# import jihuayanzheng   # local sliding-captcha helper (unused below; see the SlidingCheck repo above)
txt_file = "result.txt"
csv_file = "result.csv"
def getPage(url):
    ''' Fetch the page at the given url. '''
    try:
        # Request headers: send a browser User-Agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        }
        # Build the request object
        req = request.Request(url, headers=headers)
        # Send the request
        res = request.urlopen(req)
        # Check the response status and return the page content
        if res.getcode() == 200:
            return res.read().decode("utf-8")
        else:
            return None
    except error.URLError:
        return None
def parsePage(html):
    ''' Parse the fetched page and yield the required fields per movie. '''
    # Regular expression for one <dd> entry, extended to also capture the
    # rank (board-index) and the movie title that the task requires.
    pat = r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?<p class="name"><a.*?>(.*?)</a>.*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>.*?<i class="integer">([0-9.]+)</i><i class="fraction">([0-9]+)</i>'
    # Run the match against the whole page
    items = re.findall(pat, html, re.S)
    # Wrap each match in a dict and yield it
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'star' : item[3].strip()[3:],   # drop the "主演:" prefix
            'time' : item[4].strip()[5:],   # drop the "上映时间:" prefix
            'score': item[5] + item[6],     # integer part + fractional part
        }
def writeFile(content):
    ''' Append one record to the output file. '''
    with open(txt_file, 'a', encoding='utf-8') as f:
        # json.dumps escapes non-ASCII characters by default; pass
        # ensure_ascii=False so the Chinese text stays readable.
        f.write(json.dumps(content, ensure_ascii=False) + "\n\n\n")
def main(offset):
    ''' Driver for one page: fetch, parse, store. '''
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    print(url)
    html = getPage(url)
    # If the page was fetched successfully, parse it and write each record
    if html:
        for item in parsePage(html):
            writeFile(item)
# When run as a script, crawl all 10 pages of the board
if __name__ == '__main__':
    # Remove the output file left over from a previous run
    if os.path.exists(txt_file):
        os.remove(txt_file)
    for i in range(10):
        main(offset=i*10)
        time.sleep(0.5)   # pause between pages to stay polite
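Each record is appended to result.txt as one JSON line. For the sample markup shown earlier, the first record would come out as:

{"index": "1", "image": "https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@160w_220h_1e_1c", "title": "霸王别姬", "star": "张国荣,张丰毅,巩俐", "time": "1993-07-26", "score": "9.5"}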
Scraping the Book Top 250
Scrape the Douban Books Top 250 information page by page, parse it with each of three HTML parsing libraries (XPath, BeautifulSoup, and PyQuery), and write the results to a file. URL: https://book.douban.com/top250?start=0
Install the parsing libraries (the script below also imports requests, lxml, and beautifulsoup4):
pip install requests lxml beautifulsoup4 pyquery
import requests
from requests.exceptions import RequestException
import os, time, json, re
from lxml import etree
from bs4 import BeautifulSoup
from pyquery import PyQuery
# Pagination pattern: the start parameter advances by 25 per page
# https://book.douban.com/top250?start=0
# https://book.douban.com/top250?start=25
txt_file = "doubanBook250.txt"
def getPage(index):
    """ Fetch one listing page; index is the start offset. """
    url = "https://book.douban.com/top250"
    data = {
        'start': index,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    }
    try:
        res = requests.get(url, headers=headers, params=data)
        if res.status_code == 200:
            html = res.content.decode('utf-8')
            return html
        else:
            return None
    except RequestException:
        return None
def parsePage(which, content):
    """ Parse the page content with the chosen library. """
    if which == '1':   # parse with XPath
        print("parsePage_xpath")
        html = etree.HTML(content)
        items = html.xpath("//table/tr[@class='item']")
        for item in items:
            yield {
                'title' : item.xpath(".//div[@class='pl2']/a/@title")[0],
                'image' : item.xpath(".//img/@src")[0],
                'author': item.xpath(".//p[@class='pl']/text()")[0],
                'score' : item.xpath(".//span[@class='rating_nums']/text()")[0],
            }
    elif which == '2':   # parse with BeautifulSoup
        print("parsePage_bs4")
        soup = BeautifulSoup(content, 'lxml')
        items = soup.find_all(name='tr', attrs={'class': 'item'})
        for item in items:
            yield {
                'title' : item.select("div.pl2 a")[0]['title'],
                'image' : item.find(name='img').attrs['src'],
                'author': item.select("p.pl")[0].get_text(),
                'score' : item.select("span.rating_nums")[0].string,
            }
    elif which == '3':   # parse with PyQuery
        print("parsePage_pyquery")
        doc = PyQuery(content)
        items = doc("tr.item")
        for item in items.items():
            yield {
                'title' : item.find("div.pl2 a").attr('title'),
                'image' : item.find("img").attr('src'),
                'author': item.find("p.pl").text(),
                'score' : item.find("span.rating_nums").text(),
            }
def storeData(content):
    """ Append one record to the output file as a JSON line. """
    with open(txt_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(which):
    """ Driver: crawl all 10 pages with the chosen parser. """
    for page in range(0, 10):
        index = page * 25
        html = getPage(index)
        if not html:
            print("fetch failed")
            break
        subIndex = 0
        for item in parsePage(which, html):
            subIndex = subIndex + 1
            item['index'] = str(index + subIndex)
            print("Rank:   " + item['index'])
            print("Title:  " + item['title'])
            print("Cover:  " + item['image'])
            print("Author: " + item['author'])
            print("Score:  " + item['score'])
            print('-' * 32)
            storeData(item)
        time.sleep(0.5)
if __name__ == '__main__':
    # Remove the output file left over from a previous run
    if os.path.exists(txt_file):
        os.remove(txt_file)
    print("\n Douban Books Top250 scraper \n")
    print(" 1. XPath\n 2. BeautifulSoup\n 3. PyQuery\n")
    which = input(" Choose a parser: ")
    if re.match(r'^[123]$', which):
        print(" go...")
        main(which)
        print("\n File saved in ./%s" % (txt_file))
    else:
        print(" Sorry! Invalid input")