# Scrape the Douban Top 250 movie list using bs4 (BeautifulSoup).
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent: Douban rejects requests that arrive with the
# default python-requests UA, so every request below sends these headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
def get_urls(url):
    """Fetch one Top-250 list page and return the detail-page URLs on it.

    Args:
        url: URL of a list page (``.../top250?start=N&filter=``).

    Returns:
        List of movie detail-page URLs (empty if the page has no result grid).

    Raises:
        requests.HTTPError: if the list page cannot be fetched.
    """
    resp = requests.get(url, headers=headers)
    # Fail fast on an HTTP error instead of silently parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    grid = soup.find('ol', class_='grid_view')
    if grid is None:
        # Page past the end of the list (or an unexpected layout): nothing to scrape.
        return []
    # The first <a> inside each <li> links to the movie's detail page.
    return [li.find('a')['href'] for li in grid.find_all('li')]
def get_data(url, f):
    """Scrape one movie detail page and append a ``name,director,actor`` row to *f*.

    Args:
        url: URL of the movie detail page.
        f: An open, writable text file the CSV row is appended to.

    Note: fields are not quoted, so commas inside names can break the CSV —
    kept as-is to preserve the existing output format.
    """
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # stripped_strings yields the title fragments; join them into one string.
    name = ''.join(soup.find('h1').stripped_strings)
    # Guard every lookup: not every page has all spans (e.g. documentaries
    # may lack an actor block), and one missing tag must not kill the run.
    info = soup.find('div', id='info')
    director_span = info.find('span', class_='attrs') if info else None
    director = ''.join(director_span.stripped_strings) if director_span else ''
    actor_block = soup.find('span', class_='actor')
    actor_span = actor_block.find('span', class_='attrs') if actor_block else None
    actor = ''.join(actor_span.stripped_strings) if actor_span else ''
    f.write('{},{},{}\n'.format(name, director, actor))
def main():
    """Walk all 10 list pages of the Top 250 and write one CSV row per movie."""
    # start=N in the query string pages through the list, 25 movies at a time.
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    with open('Top250.csv', 'a', encoding='utf-8') as f:
        # 10 pages x 25 movies: start = 0, 25, ..., 225. The original
        # range(0, 251, 25) also requested start=250, an empty 11th page.
        for start in range(0, 250, 25):
            for detail_url in get_urls(base_url.format(start)):
                get_data(detail_url, f)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Note: bs4 is indeed simpler than XPath (though I prefer XPath); review this for a few days before moving on to regular expressions.