from urllib import request
import re
import json
import pymysql
import time
import cgi,cgitb
# Scrape the Maoyan Top-100 movie board (10 pages x 10 movies each) and
# store one row per movie in the `move` table of the `maoyan` database.
base_url = "http://maoyan.com/board/4?offset={}"
head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}

# Patterns compiled once, outside the loops.
dd_pat = re.compile(r'<dd>(.*?)</dd>', re.S)                 # one <dd> block per movie
name_pat = re.compile(r'title="(.*?)"', re.S)                # 名称 (title)
star_pat = re.compile(r'star">([\w\W].*?)</p>', re.S)        # 主演 (stars)
release_pat = re.compile(r'releasetime">(.*?)</p>', re.S)    # 上映时间 (release date)
score_pat = re.compile(r'integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)  # 评分 (rating)
cover_pat = re.compile(r'data-src="(.*?)"', re.S)            # 封面 (poster URL)

# One database connection for the whole run — the original reconnected
# (and never closed) once per movie row.
db = pymysql.connect(host="192.168.0.154", user="root",
                     password="123456", database="maoyan")
cursor = db.cursor()
try:
    for offset in range(0, 100, 10):  # board pages: offset = 0, 10, ..., 90
        url = base_url.format(offset)
        req = request.Request(url=url, headers=head)
        res = request.urlopen(req)
        base_data = res.read().decode("utf-8")
        for item in dd_pat.findall(base_data):
            name = name_pat.search(item).group(1)
            zhuyan = star_pat.findall(item)[0].strip()
            # NOTE: renamed from `time` — the original clobbered the
            # imported `time` module with this string.
            release_time = release_pat.findall(item)[0].strip()
            integer_part, fraction_part = score_pat.findall(item)[0]
            pingfen = integer_part + fraction_part
            tupian = cover_pat.findall(item)[0]
            # Parameterized query — the original interpolated scraped
            # text straight into the SQL string (injection-prone and
            # broken by any quote character in a title).
            cursor.execute(
                "insert into move values (null, %s, %s, %s, %s, %s)",
                (name, zhuyan, release_time, pingfen, tupian),
            )
            db.commit()
        time.sleep(1)  # be polite between page fetches
finally:
    db.close()
# python爬虫爬取猫眼电影TOP100 (Python crawler for the Maoyan movie Top 100)
# 最新推荐文章于 2024-04-29 21:15:20 发布 (stray CSDN page text — kept as a comment so the file stays valid Python)