本文为学习记录笔记,原创非常优秀~ 感谢@数挖小飞飞 思密达。
在运行源程序时,发现关于路径的报错。故,本文对@数挖小飞飞 大大的代码做了一小点修改,添加了建立文件夹函数。
原文链接:https://blog.csdn.net/qq_36936730/article/details/104668162
1.修改部分
在运行原程序时,未手动建立文件夹“film_pic”。程序报错如下:
添加函数如下,将 create_dir() 添加至主函数 main() 的第一行即可。
# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
    """Create an empty directory at *file_path*, replacing an existing one.

    Args:
        file_path: Directory to (re)create. Defaults to the folder path
            that was previously hard-coded in this function.

    Raises:
        OSError: If the old directory cannot be removed or the new one
            cannot be created (for example when *file_path* is an
            existing regular file).
    """
    import os  # Python's OS library
    import shutil

    # An existing folder is deleted and recreated. os.rmdir() only removes
    # empty directories, so it failed as soon as the folder held images from
    # an earlier run; shutil.rmtree() removes the contents as well.
    if os.path.isdir(file_path):
        shutil.rmtree(file_path)
    # makedirs also creates missing parent folders, which os.mkdir() does not.
    os.makedirs(file_path)
2.运行结果
3.完整学习代码
# 发送请求——获得页面——解析页面——抽取并储存内容
import requests
import re
import json
# Disabled single-page prototype of the scraper. Everything between the
# triple quotes is one bare string literal, so none of it is executed.
'''
# Web Capture
url = "https://movie.douban.com/top250?start=0&filter="
headers = {
"user-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3742.400 QQBrowser/10.5.3866.400"
}
response = requests.get(url,headers=headers)
text = response.text
# information extraction
regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
'</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
'span class="inq"?>(.*?)</span>'
res = re.findall(regix, text, re.S)
print(res)
# image download defination
def down_image(url,name,headers):
r = requests.get(url,headers = headers)
filename = re.search('/public/(.*?)$',url,re.S).group(1)
with open("film_pic/"+name.split('/')[0]+".jpg",'wb') as f:
f.write(r.content)
'''
# HTTP request headers sent with every requests.get() call in this script;
# the only entry is a desktop-browser User-Agent string.
headers = {
    "user-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3742.400 QQBrowser/10.5.3866.400"
    ),
}
# create dir
def create_dir(file_path=r'E:/PySource/film_pic'):
    """Create an empty directory at *file_path*, replacing an existing one.

    Args:
        file_path: Directory to (re)create. Defaults to the folder path
            that was previously hard-coded in this function.

    Raises:
        OSError: If the old directory cannot be removed or the new one
            cannot be created (for example when *file_path* is an
            existing regular file).
    """
    import os
    import shutil

    # os.rmdir() only removes empty directories, so it failed as soon as the
    # folder held images from an earlier run; shutil.rmtree() removes the
    # contents as well.
    if os.path.isdir(file_path):
        shutil.rmtree(file_path)
    # makedirs also creates missing parent folders, which os.mkdir() does not.
    os.makedirs(file_path)
# image download definition
def down_image(url, name, headers):
    """Download the image at *url* and save it as ``film_pic/<title>.jpg``.

    Args:
        url: Address of the image to fetch.
        name: Movie title; only the part before the first ``/`` is used
            as the file name.
        headers: HTTP headers passed to ``requests.get``.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        OSError: If the folder or the image file cannot be written.
    """
    import os  # function-scope import, as create_dir() already does

    # A timeout keeps one stalled connection from hanging the whole run, and
    # the status check stops an error page from being saved as a ".jpg".
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    # The target folder is relative to the working directory; create it here
    # so the write below cannot fail just because the folder is missing.
    os.makedirs("film_pic", exist_ok=True)
    with open("film_pic/" + name.split('/')[0] + ".jpg", 'wb') as f:
        f.write(r.content)
# Web page parsing function
def _clean_text(raw):
    """Decode HTML entities in *raw* and collapse whitespace runs to one space."""
    import html  # function-scope import, as create_dir() already does

    # str.split() without arguments also splits on the non-breaking space
    # that "&nbsp;" decodes to, so entity padding disappears here too.
    return ' '.join(html.unescape(raw).split())


def _build_record(item):
    """Convert one 10-group regex match into the dictionary parse_html() yields."""
    rank = item[0]
    name = item[2] + ' ' + _clean_text(item[3])
    actor = _clean_text(item[4])

    # item[5] is a "/"-separated line; pad it so that a line with fewer than
    # three parts yields empty strings instead of raising IndexError.
    fields = [part.strip() for part in _clean_text(item[5]).split('/')]
    fields += [''] * (3 - len(fields))
    year, country, tp = fields[:3]

    # item[6] is the CSS class of the star widget: one digit is rendered as
    # whole stars, more than one digit as "N and a half" stars.
    digits = [ch for ch in item[6] if ch.isnumeric()]
    if not digits:
        score = item[7] + '分'
    elif len(digits) == 1:
        score = digits[0] + '星/' + item[7] + '分'
    else:
        score = digits[0] + '星半/' + item[7] + '分'

    # Drops the last three characters of the captured text
    # (presumably a fixed textual suffix after the number -- confirm).
    rev_num = item[8][:-3]
    inq = item[9]

    return {
        '电影名称': name, '导演和演员': actor, '类型': tp, '年份': year,
        '国家': country, '评分': score, '排名': rank, '评价人数': rev_num,
        '评价': inq,
    }


def parse_html(url):
    """Fetch one listing page and yield one dictionary per matched movie.

    For every match the poster is downloaded through ``down_image`` before
    the record is yielded.

    Args:
        url: Address of the listing page to fetch.

    Yields:
        dict: Movie name, director/cast, genre, year, country, score, rank,
        number of ratings and quote, under the Chinese keys used in the
        output file.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    text = response.text
    # Capture groups: [1: rank, 2: image URL] [3: title, 4: other titles]
    # [5: director/cast, 6: year/country/genre]
    # [7: star class, 8: score, 9: number of ratings] [10: quote]
    # NOTE(review): the pattern requires an "inq" span for every entry; with
    # re.S the lazy ".*?" can run on into a following entry when one is
    # missing -- confirm against the live page.
    regix = '<div class="pic">.*?<em class="">(.*?)</em>.*?<img.*?src="(.*?)" class="">.*?' \
            'div class="info.*?class="hd".*?class="title">(.*?)</span>.*?class="other">(.*?)'\
            '</span>.*?<div class="bd">.*?<p class="">(.*?)<br>(.*?)</p>.*?' \
            'class="star.*?<span class="(.*?)"></span>.*?span class="rating_num".*?average">(.*?)</span>.*?<span>(.*?)</span>.*?' \
            'span class="inq"?>(.*?)</span>'
    # match all the results
    for item in re.findall(regix, text, re.S):
        down_image(item[1], item[2], headers=headers)
        yield _build_record(item)
# define output function
def write_movies_file(str):
    """Append *str* as one JSON line to ``top250_douban_film.txt``.

    Args:
        str: JSON-serialisable object to store. The name shadows the
            builtin but is kept because it is part of the signature.
    """
    with open('top250_douban_film.txt', mode='a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        payload = json.dumps(str, ensure_ascii=False)
        out.write(payload + '\n')
# define main()
def main():
    """Run the scraper over the offsets 0, 25, ..., 225.

    Calls ``create_dir()`` once, then for every offset builds the page URL,
    prints each record yielded by ``parse_html`` and passes it to
    ``write_movies_file``.
    """
    create_dir()
    for start in range(0, 250, 25):
        page_url = ''.join(
            ('https://movie.douban.com/top250?start=', str(start), '&filter=')
        )
        for movie in parse_html(page_url):
            print(movie)
            write_movies_file(movie)


if __name__ == '__main__':
    main()