"""入门爬虫: 爬取豆瓣图书 Top 250, 并保存到 Excel (beginner crawler: scrape Douban Book Top 250 and save to Excel)."""
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Download *url* and return the ``requests.Response``.

    Douban rejects requests that do not look like they come from a real
    browser, so a desktop Chrome/Edge User-Agent header is sent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'}
    # timeout keeps one unresponsive server from hanging the whole crawl
    resp = requests.get(url, headers=headers, timeout=10)
    return resp
def html_parse(resp):
    """Parse one Top-250 page into four parallel column lists.

    Returns a dict with keys ``name``/``author``/``score``/``sum``.  Each
    book on the page occupies its own ``<table>``, so fields are extracted
    per item; a missing rating or quote yields ``None`` in that slot.  (The
    original page-wide ``find_all`` produced shorter score/quote lists when
    a book lacked an ``inq`` span, misaligning every following row.)
    """
    soup = BeautifulSoup(resp.text, 'lxml')
    names, authors, scores, sums = [], [], [], []
    for item in soup.find_all('table'):
        title_div = item.find('div', class_='pl2')
        if title_div is None:
            # layout table, not a book entry
            continue
        names.append(title_div.find('a')['title'])
        authors.append(item.find('p', class_='pl').get_text())
        score = item.find('span', class_='rating_nums')
        scores.append(score.get_text() if score else None)
        quote = item.find('span', class_='inq')
        sums.append(quote.get_text() if quote else None)
    data = {
        'name': names,
        'author': authors,
        'score': scores,
        'sum': sums
    }
    return data
def all_page():
    """Return the URLs of all ten Top-250 list pages (25 books per page)."""
    base_url = 'http://book.douban.com/top250?start='
    # offsets 0, 25, ..., 225 select each page of 25 entries
    return [base_url + str(offset) for offset in range(0, 250, 25)]
def main():
    """Crawl every Top-250 page once and export the collected rows to Excel."""
    data = {'name': [], 'author': [], 'score': [], 'sum': []}
    for url in all_page():
        # Fetch and parse each page exactly once.  The original called
        # html_parse(get_html(url)) separately for every column, issuing
        # four HTTP requests per page (40 total instead of 10).
        page = html_parse(get_html(url))
        for key in data:
            data[key].extend(page[key])
    # Wrapping each column in pd.Series lets unequal lengths pad with NaN
    # instead of raising; na_rep writes those cells as a blank space.
    df = pd.DataFrame({key: pd.Series(values) for key, values in data.items()})
    df.to_excel('图书表.xlsx', sheet_name='2020收集', na_rep=" ")
    print("完成")


if __name__ == "__main__":
    main()