Python3爬虫入门之爬取豆瓣Top250电影名称
准备工具
- Python3.5
- requests
- BeautifulSoup
- lxml
最终效果
![这里写图片描述](https://img-blog.csdn.net/20170305205718281?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvcXFfMzE3NTI3Mjc=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
- 首先看一下网站的结构
可以很清楚的看到每个电影对应了一个<li>
标签,我们只需要一步一步的从<ol>
向下搜索,可以得到电影对应的名称,即<span class="title">肖申克的救赎</span>
这一行 - 接着看一下网页内 后页按钮对应的代码结构
可以看出后一页的URL为 https://movie.douban.com/top250?start=25&filter=
最后一页则没有这个标签,对应 None
这样我们就可以进行翻页了
直接上代码 - 获取html代码
这里使用requests模块,获取很方便
import requests
def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent is sent because douban.com rejects the
    default requests UA.  A timeout is set so a stalled connection
    cannot hang the crawler forever (the original call had none), and
    HTTP errors raise instead of silently returning an error page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    return response.content
- 解析html
获取到html源码后就要对其进行解析了,这里使用BeautifulSoup模块
from bs4 import BeautifulSoup
URL='https://movie.douban.com/top250'
def parse_html(html):
    """Parse one Top250 listing page.

    Returns a tuple ``(movie_name_list, next_url)`` where *next_url* is
    the absolute URL of the following page, or ``None`` on the last page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_name_list = []
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        movie_name_list.append(movie_name)
    # On the last page the "next" span contains no <a>; if the span itself
    # is missing the original code raised AttributeError — guard both cases.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span else None
    if next_page:
        # href is a relative query string like "?start=25&filter=",
        # so appending it to the base URL yields the next page.
        return movie_name_list, URL + next_page['href']
    return movie_name_list, None
from bs4 import BeautifulSoup
URL='https://movie.douban.com/top250'
def parse_html1(html):
    """CSS-selector variant of ``parse_html``.

    Returns ``(movie_names, next_url)``; *next_url* is ``None`` on the
    last page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_names = []
    # One selector walks straight down to the title anchor of each entry.
    movie_list = soup.select('ol.grid_view li div.item div.info div.hd a')
    for movie_title in movie_list:
        movie_name = movie_title.find('span', class_='title')
        movie_names.append(movie_name.getText())
    # Guard against a missing "next" span (the original crashed with
    # AttributeError when soup.find returned None).
    next_span = soup.find('span', class_='next')
    next_page = next_span.find('a') if next_span else None
    if next_page:
        return movie_names, URL + next_page['href']
    return movie_names, None
import requests
from bs4 import BeautifulSoup
URL='https://movie.douban.com/top250'
def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a browser User-Agent (douban.com blocks the default requests
    UA), sets a timeout so a dead connection cannot block the crawler
    indefinitely, and raises on HTTP error statuses.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
    return response.content
def parse_html1(html):
    """Extract movie titles from one Top250 page via CSS selectors.

    Returns ``(movie_names, next_url)``; *next_url* is ``None`` when
    there is no following page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_names = []
    movie_list = soup.select('ol.grid_view li div.item div.info div.hd a')
    for movie_title in movie_list:
        movie_name = movie_title.find('span', class_='title')
        movie_names.append(movie_name.getText())
    # The "next" span may be absent entirely; the original code would
    # raise AttributeError calling .find('a') on None.
    next_span = soup.find('span', class_='next')
    next_page = next_span.find('a') if next_span else None
    if next_page:
        # href is a relative query string ("?start=25&filter="), append to base URL.
        return movie_names, URL + next_page['href']
    return movie_names, None
def main():
    """Crawl every Top250 page and write one movie title per line.

    Follows the next-page URL returned by ``parse_html1`` until it is
    ``None`` (last page reached).
    """
    url = URL
    # The original used codecs.open, but `codecs` was never imported
    # (NameError at runtime).  Built-in open with an explicit encoding
    # is the modern equivalent; newline='' keeps the literal '\r\n'
    # line endings written below untouched.
    with open('e:/movies.txt', 'w', encoding='utf-8', newline='') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html1(html)
            for movie_name in movies:
                fp.write(movie_name)
                fp.write('\r\n')


if __name__ == '__main__':
    main()