import codecs
import html
import re

import requests
from bs4 import BeautifulSoup
# Request headers passed to requests.get() in pachong(); carries a desktop
# Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.88 Safari/537.36'
    ),
}
def pachong(url, timeout=10):
    """Scrape one listing page and write one table row per item to ``outf``.

    Fetches ``url`` with the module-level ``headers``, parses the response
    with BeautifulSoup and, for every element whose class is ``item``, prints
    the extracted fields and writes a six-cell HTML table row to the
    module-level file object ``outf``, which must already be open for writing.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
        AttributeError, IndexError: If an item lacks the rank, title, link or
            rating element this parser looks up.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    print(u'豆瓣top250 \n')

    # Compiled once per page rather than once per item.
    number_re = re.compile(r'\d+\.?\d*')

    for tag in soup.find_all(attrs={"class": "item"}):
        # Ranking number.
        rank = tag.find('em').get_text()
        print(rank)

        # Several elements may carry class "title"; only the first is used.
        title = tag.find_all(attrs={"class": "title"})[0].get_text()
        print(u'[名称]', title)

        # Link taken from the anchor inside the "hd" element.
        link = tag.find(attrs={"class": "hd"}).a.attrs['href']
        print(u'[链接]', link)

        # The first number in the "star" text is reported as the score and
        # the second as the number of raters; any further numbers are ignored.
        numbers = number_re.findall(tag.find(attrs={"class": "star"}).get_text())
        score = numbers[0] if numbers else u''
        votes = numbers[1] if len(numbers) > 1 else u''
        if numbers:
            print(u"[分数]" + score)
        if len(numbers) > 1:
            print(u"[评论人数]" + votes)

        # The one-line quote is optional.
        quote_tag = tag.find(attrs={"class": "inq"})
        quote = u''
        if quote_tag:
            quote = quote_tag.get_text()
            print(u'[评语]', quote)

        # Always emit all six cells so every row lines up with the header,
        # and escape the scraped text so '&' or '<' cannot break the markup.
        cells = [rank, title, link, score, votes, quote]
        row = u"</th><th>".join(html.escape(cell, quote=False) for cell in cells)
        outf.write(u"<tr><th>" + row + u"</th></tr>" + "\n")
if __name__ == '__main__':
    # Script entry point: scrape the ten listing pages (25 entries each) and
    # save them as a single HTML table in top250.html.
    #
    # `outf` is deliberately a module-level name because pachong() writes to
    # it as a global.
    #
    # Mode 'w' (not 'a'): every run emits a complete <html> document, so
    # appending to a previous run's output would produce a malformed file.
    # The `with` block closes the file even if a page fails to download.
    with codecs.open('top250.html', 'w', 'utf-8') as outf:
        # Declare the encoding so browsers decode the Chinese text correctly.
        outf.write(u"<html>\n<head><meta charset=\"utf-8\"></head>\n<body>\n<table border=1>\n")
        outf.write(u"<tr><th>序号</th><th>名称</th><th>链接</th><th>评分</th><th>评论人数</th><th>影评</th></tr>")
        for page in range(10):
            print(u'页码', page + 1)
            # The `start` query parameter is the offset of the first entry.
            url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
            pachong(url)
        # Closing tag was previously misspelled as </bady>.
        outf.write(u"</table>\n</body>\n</html>")
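# Environment: Visual Studio 2019.
# If you have any questions, feel free to leave a comment.
# References: the book 《Python网络数据爬取及分析从入门到精通》 by 杨秀璋 and 颜娜
# (Yang Xiuzhang, Yan Na), plus material found via Baidu and on CSDN.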