#print(url)
r=requests.get(url)
if r.status_code !=200:
raise Exception('error')
htmls.append(r.text)
#print(htmls)
return htmls
def parse_single_html(html):
    """Parse one Douban Top-250 HTML page into a list of movie records.

    Each record is a dict with keys 'rank', 'title', 'rating_num' and
    'comments' (review count with the trailing '人评价' suffix stripped).
    """
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Every movie entry on the page sits inside a <div class="item"> container.
    records = []
    for entry in soup.find_all('div', class_='item'):
        movie_title = entry.find('span', class_="title").get_text()
        movie_rank = entry.find('em', class_='').get_text()
        score = entry.find('span', class_="rating_num").get_text()
        # The 4th <span> under the star block holds text like "123456人评价".
        review_count = entry.find('div', class_='star').find_all('span')[3].get_text()
        records.append({
            'rank': movie_rank,
            'title': movie_title,
            'rating_num': score,
            'comments': review_count.replace('人评价', ''),
        })
    return records
if __name__ == '__main__':
    # Fetch every Top-250 page, parse each one, and pool the rows together.
    pages = creat_url()
    rows = []
    for page in pages:
        rows.extend(parse_single_html(page))
    # Fix the column order explicitly so the CSV layout is stable.
    frame = pd.DataFrame(rows, columns=['rank', 'title', 'rating_num', 'comments'])
    print(frame)
    frame.to_csv('豆瓣电影Top250.csv', index=False)
# 运行结果如下: (the run's output follows)
# 来源网络,侵权联系删除 (sourced from the web; contact for removal if infringing)