import requests
from bs4 import BeautifulSoup
# Entry page of the Douban Top 250 list. parse_html() also appends the href
# of the "next" link to this value to build the following page's URL.
url_a = "https://movie.douban.com/top250"
def download_page(url, timeout=10, headers=None):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.
        headers: Optional mapping of extra HTTP request headers.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.RequestException: On connection failure, timeout, or when
            the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # HTML parser, where it would surface as an unrelated failure.
    response.raise_for_status()
    return response.content
def parse_html(html):
    """Extract the movie titles and the next-page URL from one list page.

    Args:
        html: Markup of a list page (``str`` or ``bytes``), e.g. the value
            returned by ``download_page``.

    Returns:
        A tuple ``(titles, next_url)``. ``titles`` is a list of ``str``
        holding the text of the first ``span.title`` inside each entry's
        ``div.hd``. ``next_url`` is ``url_a`` plus the ``href`` of the link
        inside ``span.next``, or ``None`` when the page has no such link.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, 'html.parser')
    movie_name_list = []

    movie_list_soup = soup.find('ol', attrs={'class': "grid_view"})
    # find() returns None when the element is absent; treat a page without
    # the list as having no entries instead of raising AttributeError.
    list_items = [] if movie_list_soup is None else movie_list_soup.find_all('li')
    for movie_li in list_items:
        detail = movie_li.find('div', attrs={'class': 'hd'})
        title = None if detail is None else detail.find('span', attrs={'class': 'title'})
        if title is None:
            # Entry without the expected title markup: skip it.
            continue
        # Keep the title as text; encoding to UTF-8 here would produce bytes
        # that print as b'\xe8...' on Python 3.
        movie_name = title.getText()
        movie_name_list.append(movie_name)
        print(movie_name)

    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = None if next_span is None else next_span.find('a')
    if next_page is not None:
        next_url = url_a + next_page['href']
        print('ok' + ' ' + next_url)
        return movie_name_list, next_url
    return movie_name_list, None
def main():
    """Walk the list page by page, printing each page's titles.

    Starts at ``url_a`` and keeps following the next-page URL returned by
    ``parse_html`` until it comes back empty.
    """
    page_url = url_a
    while page_url:
        page_markup = download_page(page_url)
        page_titles, page_url = parse_html(page_markup)
        print(page_titles)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
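# Page title: "python 爬取电影" (Python: scraping movies)
# Page footer: "最新推荐文章于 2024-08-22 17:32:24 发布" (latest recommended article published 2024-08-22 17:32:24)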