实现概要说明
找电影时,发现网站自带的搜索功能可选项不多,于是写了个脚本,在网站标签搜索结果的基础上做二次筛选,获取电影信息。需要注意的是,运行爬虫脚本之前,需要先用网站的标签搜索功能打开对应的标签结果页,记下该页面的地址(其中包含标签编号),把它填到脚本里的 web_url 处,然后再运行脚本。
源代码
#coding:utf-8
import urllib.request
from bs4 import BeautifulSoup
import time
# 访问获取网页结果
def GetWeb(web_url, timeout=30):
    """Fetch *web_url* and return the response body decoded as UTF-8.

    Args:
        web_url: URL to request.
        timeout: Seconds passed to ``urlopen`` as the limit for blocking
            network operations, so a stalled server cannot hang the
            caller indefinitely.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: the request could not be completed
            (``HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: the body is not valid UTF-8.
    """
    req = urllib.request.Request(web_url)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36')
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(req, timeout=timeout) as rsp:
        return rsp.read().decode('utf-8')
# 解析网页结果
def ParseMovesTagPage(web_txt):
    """Parse one tag-result page and return its movie records.

    Every <dl> element in *web_txt* is handed to ParseTagDl; results that
    come back empty (falsy) are left out.

    Args:
        web_txt: HTML text of the page.

    Returns:
        A list of the non-empty dicts produced by ParseTagDl, in document
        order.
    """
    soup = BeautifulSoup(web_txt, 'html5lib')
    candidates = (ParseTagDl(dl_tag) for dl_tag in soup.find_all('dl'))
    return [movie for movie in candidates if movie]
def ParseTagDl(bs_dl):
    """Extract the movie name and ratings from one <dl> element.

    The direct children of the element's <dt> are reduced to their
    stripped, non-empty text values. When at least four remain, the
    second, third and fourth are returned as name, Douban rating and IMDb
    rating (the first one is skipped).

    Args:
        bs_dl: A parsed <dl> element exposing ``.dt``.

    Returns:
        ``{'name': ..., 'douban': ..., 'imdb': ...}`` with string values,
        or an empty dict when there is no <dt> or fewer than four
        non-empty texts.
    """
    dt = bs_dl.dt
    if not dt:
        return {}
    # `.string` is None for a child without a single text value (e.g. an
    # empty tag or one with several children); treat it as empty text so
    # one such child does not abort parsing of the whole page.
    texts = [(node.string or '').strip() for node in dt.contents]
    texts = [text for text in texts if text]
    if len(texts) < 4:
        return {}
    return {'name': texts[1], 'douban': texts[2], 'imdb': texts[3]}
# 结果写文件
def FileAdd(fn, txt):
    """Append *txt* to the text file *fn*, creating the file if needed.

    The file is written as UTF-8 explicitly so non-ASCII text (such as
    Chinese movie titles) is stored the same way regardless of the
    platform's default encoding.

    Args:
        fn: Path of the file to append to.
        txt: Text to append; no newline is added.
    """
    with open(fn, 'a', encoding='utf-8') as fd:
        fd.write(txt)
# 代码测试
def _score_above(score_txt, threshold=8.0):
    """Return True when the number in *score_txt* is greater than *threshold*.

    Ratings arrive as scraped text. Comparing them as strings orders '10'
    below '8' and lets non-numeric placeholders pass, so the first number
    found in the text is compared numerically instead. Text without a
    number is treated as "not above".
    NOTE(review): assumes the rating is the first number in the text;
    confirm against the site's markup.
    """
    import re

    found = re.search(r'\d+(?:\.\d+)?', str(score_txt))
    return bool(found) and float(found.group()) > threshold


if __name__ == '__main__':
    # Change the tag number in this URL to match the tag search result
    # page you want to crawl.
    web_url = r'http://www.bd-film.co/tag/58_'
    for i in range(1, 1000):
        cur_url = web_url + str(i) + '.jspx'
        try:
            print(cur_url)
            txt = GetWeb(cur_url)
            mvs = ParseMovesTagPage(txt)
            for mv in mvs:
                # Filter: keep movies rated above 8 on Douban or on IMDb.
                if _score_above(mv['douban']) or _score_above(mv['imdb']):
                    info = '%s \t %s \t %s' % (mv['douban'], mv['imdb'], mv['name'])
                    FileAdd('bd-高分剧情.txt', info + '\n')
                    print(info)
        except Exception:
            # Skip pages that fail to download or parse. Catching
            # Exception rather than everything lets Ctrl-C stop the crawl.
            print('error->' + cur_url)
        # Pause between pages to avoid hammering the site.
        time.sleep(1)
运行结果截图
这样就可以很开心地根据自己的需要找电影了,我一般喜欢随大众,所以写成按电影评分搜索,嘿嘿嘿~~