A few days ago I had a sudden urge to write a crawler, so I started with one that scrapes data from Mtime (时光网). It mainly uses basic BeautifulSoup calls to pull the data out. The main code is below (detailed explanations are in the inline comments, so I won't repeat them here):
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 18 14:37:37 2018
@author: cxoke
Purpose: scrape the title, director, genre, etc. of each film on the Mtime Top 100 list
"""
from bs4 import BeautifulSoup
import requests
import pandas as pd

if __name__ == '__main__':
    name = []
    director = []
    types = []
    star = []
    for i in range(2, 11):
        # Pagination: build the URL for each page so several pages can be
        # scraped in one run (note that page 1 is not fetched by this loop)
        url = 'http://www.mtime.com/top/movie/top100/index-{}.html'.format(i)
        # Spoof a browser User-Agent so the site's anti-crawler checks
        # do not block the request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        # Find elements with class "mov_con", drill down to their <h2>
        # tags, and grab the text inside (the film titles)
        titles = bf.select('.mov_con h2')
        for each in titles:
            name.append(each.text)
        # The <p> tags under the same class hold director, cast and genre
        infos = bf.select('.mov_con p')
        for info in infos:
            if '导演:' in info.text:
                director.append(info.text)
            if '主演: ' in info.text:
                star.append(info.text)
            if '类型:' in info.text:
                types.append(info.text)
    # Write the scraped data to a CSV file;
    # the dict keys become the CSV column names
    dataframe = pd.DataFrame({'电影名': name, '导演': director,
                              '主演': star, '类型': types})
    dataframe.to_csv('mtime_top100.csv', index=False, encoding='utf-8')  # output filename is just a placeholder
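
One thing to watch out for with this approach: pd.DataFrame requires all four lists to have the same length, so if any film on a page happens to be missing one of the 导演/主演/类型 lines, the script will crash with a ValueError at the very end. A more robust variant is to build one dict per film, so a missing field simply becomes NaN in the DataFrame. The sketch below assumes each film sits in its own .mov_con block with its own <h2> and <p> tags; I haven't re-checked Mtime's markup, and the HTML fragment is made up purely for illustration:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pandas as pd

# Made-up fragment mimicking the assumed structure of one list entry
html = '''
<div class="mov_con">
  <h2>肖申克的救赎</h2>
  <p>导演: 弗兰克·德拉邦特</p>
  <p>类型: 剧情</p>
</div>
'''

bf = BeautifulSoup(html, 'lxml')
rows = []
for mov in bf.select('.mov_con'):       # one block per film
    row = {'电影名': mov.select_one('h2').text}
    for p in mov.select('p'):
        for key in ('导演:', '主演:', '类型:'):
            if key in p.text:
                row[key.rstrip(':')] = p.text
    rows.append(row)

# Films with missing fields get NaN instead of breaking the column lengths
print(pd.DataFrame(rows))

Since the rows are collected per film, the title, director, cast and genre also stay aligned with each other, which the four parallel lists above cannot guarantee.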
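Also, the loop above fires nine requests back-to-back, which is exactly the access pattern anti-crawler systems tend to flag even when the User-Agent looks like a browser. A short pause between pages is a cheap safeguard; the one-second value here is an arbitrary choice of mine, not anything the site documents:

import time

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
for i in range(2, 11):
    url = 'http://www.mtime.com/top/movie/top100/index-{}.html'.format(i)
    req = requests.get(url=url, headers=headers)
    # ... parse req.text with BeautifulSoup as in the main script ...
    time.sleep(1)  # arbitrary one-second pause so the pages are not hammered back-to-back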