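# Maoyan scraper. Target page: https://maoyan.com/films
# The records scraped from that page are stored in result.txt.
# A local folder named "photo" is also created; it holds the poster image of
# each movie found on the page, saved under the movie's name.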
import json
import requests
from requests.exceptions import RequestException
import re
import os
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def ping_fen(item):
    """Turn a scraped rating fragment into a plain score string.

    Args:
        item: Text captured from the rating element. When it contains the
            word ``integer`` it is treated as markup holding the score in
            two ``<i>`` elements (classes ``integer`` and ``fraction``).

    Returns:
        The integer and fraction parts concatenated (e.g. ``'9.5'``) when
        *item* contains ``integer``; otherwise *item* unchanged.
    """
    if 'integer' not in item:
        return item
    fragment = item.strip()
    # Read the two parts from the markup itself rather than from fixed
    # character positions, so whitespace between the tags or a wider
    # integer part does not corrupt the result.
    match = re.search(
        r'class="integer">\s*(.*?)\s*</i>.*?class="fraction">\s*(.*?)\s*</i>',
        fragment,
        re.S,
    )
    if match:
        return match.group(1) + match.group(2)
    # Unrecognised markup: keep the historical fixed-offset behaviour.
    return fragment[19:21] + fragment[45:46]
def parse_one_page(html, url):
    """Yield one record per movie entry found in the page markup.

    Args:
        html: Page markup to scan. ``None`` or an empty string (for example
            after a failed download) yields nothing instead of raising.
        url: Prefix prepended to each captured link.

    Yields:
        A dict with the keys ``'购票地址'`` (``url`` + captured link),
        ``'电影图片'`` (captured image address), ``'电影名称'`` (captured
        title) and ``'评分'`` (captured rating passed through ``ping_fen``).
    """
    if not html:
        # re.findall would raise TypeError on None; treat it as "no movies".
        return
    pattern = re.compile('<dd>.*?movie-item.*?a.*?="(.*?)" target.*?poster-default.*?img.*?="(.*?)" />.*?movie-item-title.*?a.*?>(.*?)</a>.*?channel-detail-orange.*?>(.*?)</div>', re.S)
    for link, image, title, score in pattern.findall(html):
        yield {
            '购票地址': url + link,
            '电影图片': image,
            '电影名称': title,
            '评分': ping_fen(score),
        }
def write_to_file(content):
    """Append *content* to ``result.txt`` as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8), not as escape sequences.
    """
    with open('result.txt', 'a', encoding='utf-8') as out:
        record = json.dumps(content, ensure_ascii=False)
        out.write(record + '\n')
def DeatleFile(file_name):
    """Delete the directory *file_name* in the current working directory.

    Does nothing when no such directory exists.

    Args:
        file_name: Name of the directory, relative to the current working
            directory.
    """
    import shutil  # local import: only this function needs it

    # os.path.join instead of a hard-coded '\\' so the path is valid on
    # every platform, not only on Windows.
    path = os.path.join(os.getcwd(), file_name)
    # Check the exact directory rather than a substring of any entry name,
    # so an unrelated entry such as "<file_name>_backup" cannot trigger a
    # delete of a directory that is not there.
    if os.path.isdir(path):
        # rmtree removes only this tree; os.removedirs would additionally
        # try to prune empty parent directories.
        shutil.rmtree(path)
def CreatFile(file_nam):
    """Create the directory *file_nam* inside the current working directory.

    Args:
        file_nam: Name of the directory, relative to the current working
            directory. An already existing directory is left untouched.
    """
    # os.path.join instead of a hard-coded '\\': on POSIX a backslash is an
    # ordinary character, so the old path named a different directory.
    os.makedirs(os.path.join(os.getcwd(), file_nam), exist_ok=True)
def ReadFile(file_name):
    """Collect (image address, movie name) pairs from a records file.

    Args:
        file_name: Path of a UTF-8 text file with one record per line, in
            the format written by ``write_to_file``.

    Returns:
        A list with one entry per line that contained a record; each entry
        is the list of ``(image_address, movie_name)`` tuples found on that
        line. Lines without a record are skipped.
    """
    # Compiled once; the pattern does not change between lines.
    pattern = re.compile('电影图片.*? "(.*?)".*?电影名称".*?"(.*?)"', re.S)
    photo_list = []
    # Read-only access is sufficient, and the context manager guarantees
    # the handle is closed.
    with open(file_name, 'r', encoding='UTF-8') as fo:
        for line in fo:
            matches = pattern.findall(line)
            # findall returns an empty list (never None) when nothing matches.
            if matches:
                photo_list.append(matches)
    return photo_list
def DownPhoto(list_html, file_name, timeout=10):
    """Download each listed image into the *file_name* directory.

    Args:
        list_html: Nested list in the shape returned by ``ReadFile``: each
            element is a list of ``(image_address, movie_name)`` tuples.
        file_name: Directory (relative to the current working directory)
            that receives the images; it must already exist.
        timeout: Seconds to wait for each download before skipping it.

    Each image is saved as ``<movie_name>.png``. A download that fails or
    does not answer with HTTP 200 is skipped so that one bad address does
    not abort the remaining downloads.
    """
    target_dir = os.path.join(os.getcwd(), file_name)
    for photo_html in list_html:
        for message in photo_html:
            try:
                r = requests.get(message[0], timeout=timeout)
            except RequestException:
                # Same best-effort policy as get_one_page: ignore failures.
                continue
            if r.status_code != 200:
                # Do not store an error page under a .png name.
                continue
            photo_path = os.path.join(target_dir, str(message[1]) + '.png')
            with open(photo_path, 'wb') as f:
                f.write(r.content)
def Creat_Photo(file_name):
    """Rebuild the image directory *file_name* from the saved records.

    Removes any previous directory, creates it again, reads the records
    from ``result.txt`` and downloads the images they reference.
    """
    # Reset the target directory: delete first, then recreate.
    for prepare in (DeatleFile, CreatFile):
        prepare(file_name)
    DownPhoto(ReadFile('result.txt'), file_name)
def main():
    """Scrape the film list page and store every movie record found."""
    url = 'https://maoyan.com'
    html = get_one_page(url + '/films')
    if html is None:
        # get_one_page returns None on any download failure; parsing None
        # would raise a TypeError, so stop here with a short notice.
        print('Failed to download ' + url + '/films')
        return
    for item in parse_one_page(html, url):
        write_to_file(item)
# Script entry point: scrape the listing first, then download the images
# referenced by the records that were written.
if __name__ == "__main__":
    main()
    Creat_Photo("photo")