前言
Beautiful Soup 库的一些应用示例,作为学习笔记。
代码
爬取豆瓣top250电影榜单的一些信息,
链接:https://movie.douban.com/top250
import bs4
import requests
def find_pages(url):
    """Return the total page count shown in the pagination bar of *url*.

    The number of the last page is the element sitting two siblings before
    the 'next' span, so we hop backwards twice from it.
    """
    page = open_url(url)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    next_link = soup.find('span', class_='next')
    last_page = next_link.previous_sibling.previous_sibling
    return int(last_page.text)
def open_url(url, timeout=10):
    """GET *url* with a browser-like User-Agent and return the Response.

    The User-Agent header is required because douban rejects requests
    that identify themselves as a script.

    Args:
        url: The page to fetch.
        timeout: Seconds to wait before giving up (added so a stalled
            connection cannot hang the crawl forever; default keeps the
            call backward-compatible).

    Returns:
        The ``requests.Response`` object, unchecked — callers read ``.text``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response
def find_things(url):
    """Scrape one result page and return one formatted line per film.

    Extracts three parallel lists from the page — titles (``div.hd``),
    ratings (``span.rating_num``) and quotes (the tag two siblings after
    each ``div.star``) — then pairs them up into display strings.

    A film may have no quote; in that case the sibling hop yields ``None``
    and the placeholder "无" is recorded, which keeps the three lists the
    same length. ``zip`` is used for the final pairing so a stray length
    mismatch truncates instead of raising IndexError (the original
    ``range(len(...))`` indexing would crash).

    Args:
        url: URL of one top250 page.

    Returns:
        list[str]: "<title> 评分:<score><quote>\\n" entries for the page.
    """
    response = open_url(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    film_name = [each.a.span.text for each in soup.find_all('div', class_='hd')]
    film_score = ['评分:%s' % each.text for each in soup.find_all('span', class_='rating_num')]

    film_quote = []
    for each in soup.find_all('div', class_='star'):
        # The quote tag (if any) is two siblings after the star block.
        quote_tag = each.next_sibling.next_sibling
        if quote_tag is not None:
            film_quote.append(quote_tag.text)
        else:
            film_quote.append("无")

    return [name + ' ' + score + quote + "\n"
            for name, score, quote in zip(film_name, film_score, film_quote)]
def main():
    """Crawl every page of the douban top250 list and save the results.

    Discovers the page count from the first page, fetches each page
    (25 films per page, ``start`` offset stepping by 25), then writes
    all collected lines to a UTF-8 text file in one batch.
    """
    host = 'https://movie.douban.com/top250'
    pages = find_pages(host)
    crawl_result = []
    for i in range(pages):
        url = host + "/?start=" + str(25 * i)
        crawl_result.extend(find_things(url))
    # Write the whole batch at once; each entry already ends with '\n'.
    # (The original called f.writelines(each) per string, which only
    # worked because a str is an iterable of characters.)
    with open('豆瓣电影top250爬取结果.txt', 'w', encoding='utf-8') as f:
        f.writelines(crawl_result)


if __name__ == "__main__":
    main()
注意:如果某部电影没有 quote(短评),直接抓取 quote 标签会导致 film_quote 列表与前面 film_name 列表的长度不一致。因此可以先搜索评分星级(star)元素,再用 bs4 的 next_sibling.next_sibling 定位对应的 quote;若不存在,则补充占位符“无”,以保持各列表长度一致。
结果: