1.爬取豆瓣电影 Top 250
豆瓣电影 Top 250 榜单共有 250 部电影，需要分页抓取：每页 25 部，共 10 页，start 参数每次递增 25。
# Build the paging offsets: the Top-250 list shows 25 movies per page,
# so the "start" query parameter runs 0, 25, 50, ..., 225.
page_indexs = range(0, 250, 25)
headers 请求头为:
# Request headers: a desktop User-Agent plus a logged-in session Cookie so
# Douban serves the real page instead of an anti-bot response.
# NOTE(review): the Cookie is hard-coded and will expire — refresh before reuse.
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
'Cookie': 'douban-fav-remind=1; bid=xV3mzxZaZxE; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1558577880%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DV45iHCRmrzprwRNYFC4Tj2b6lo3J1MbSYFIzkgsyd9EERM6EE6OIaTGDe9z7NaJj%26wd%3D%26eqid%3Db8eae7580058b189000000025ce602d1%22%5D; _pk_id.100001.4cf6=cf9fd67f-eed2-4398-a10d-333daee6b8b7.1558577880.1.1558577880.1558577880.; _pk_ses.100001.4cf6=*; __utma=30149280.1353623796.1524120262.1556333363.1558577881.8; __utmb=30149280.0.10.1558577881; __utmc=30149280; __utmz=30149280.1558577881.8.7.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.2144420921.1558577881.1558577881.1558577881.1; __utmb=223695111.0.10.1558577881; __utmc=223695111; __utmz=223695111.1558577881.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}
信息都存在于div.info中,使用response读取数据,以列表形式读取存储到lists,其中包括了name,year,country,category,star,people_num的信息 。其中详细的演员信息需要单独读取。
def get_movie_list(url, headers):
    """Fetch one Top-250 listing page and persist every movie on it.

    url     -- listing page URL (e.g. .../top250?start=25)
    headers -- request headers (User-Agent / Cookie) passed to requests.get

    Each div.info entry yields name, year, country, category, star rating
    and rating count; per-movie actor details come from get_detail_movie().
    Rows whose parsed "year" actually contains director text ('导') are
    skipped; the rest are printed and stored via mv_mysql.save_data().
    """
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    datas = []
    for info_div in soup.select('div.info'):  # renamed: 'list' shadowed the builtin
        sing_url = info_div.select('a')[0].get('href')
        name = info_div.select('div.hd .title')[0].text
        # .strip() (the original .strip('') stripped nothing) trims the raw
        # text before splitting the "year / country / category" line apart.
        type_list = (info_div.select('div.bd p')[0].text
                     .strip().split('...')[-1].replace(' ', '').split('/'))
        year = ''.join(type_list[0].split())
        country = ''.join(type_list[1].split())
        category = ''.join(type_list[2].split())
        star = info_div.select('div.bd .star .rating_num')[0].text.replace(' ', '')
        people_num = info_div.select('div.bd .star span:nth-of-type(4)')[0].text.split('人')[0]
        data = get_detail_movie(sing_url, name, year, country, category,
                                star, people_num, headers)
        datas.append(data)
    for info in datas:
        # Entries without a year column leave the director line ('导演...')
        # in the "year" slot — skip those malformed rows.
        if '导' in info['year']:
            continue
        print(info)
        mv_mysql.save_data(info)
其中get_detail_movie函数用来单独读取演员信息。
def get_detail_movie(movie_url, name, year, country, category, star, people_num, headers):
    """Fetch a movie's detail page and return one record as a dict.

    Follows movie_url to scrape the actor list (span.actor under #info);
    all other fields are passed through from the listing page unchanged.
    Returns a dict ready for mv_mysql.save_data().
    """
    response = requests.get(movie_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # daoyan = soup.select('#info > span:nth-of-type(1) > span.attrs')[0].text
    actor_parts = []
    for span in soup.select('#info > span.actor > span.attrs'):
        # Actor names are separated by '/'; drop the separators.
        actor_parts.append(span.text.replace('/', ''))
    # Join the pieces directly. The original round-tripped through
    # str(list) and string surgery, which left a stray ']' (and commas)
    # in the stored value.
    yanyuan = ' '.join(actor_parts)
    data = {
        'name': name,
        'year': year,
        'country': country,
        'category': category,
        'star': star,
        'people_num': people_num,
        # 'daoyan': daoyan,
        'yanyuan': yanyuan,
    }
    return data
对于信息本文采用mysql数据库存储,采用pymysql,安装比较简单。
import pymysql
首先需要连接到MYSQL数据库中。
# Connect to the MySQL database. Keyword arguments are used because
# PyMySQL 1.0 removed positional connect() arguments; replace the
# placeholders with your own host / user / password / database name.
db = pymysql.connect(host="host", user="usr", password="password", database="database")
cursor = db.cursor()
其中 host 填你的主机地址（本机可用 localhost），usr 是登录 MySQL 的用户名，password 是对应密码，database 是要使用的数据库名。
在数据库中建表
def creat_table():
    """(Re)create the DBDY_250 table, dropping any existing copy first."""
    # Start from a clean slate so repeated runs don't fail on CREATE.
    cursor.execute("DROP TABLE IF EXISTS DBDY_250")
    # '#' inside the DDL is MySQL comment syntax: the daoyan column is
    # intentionally disabled, matching the commented-out scraper field.
    ddl = '''CREATE TABLE DBDY_250(
name text(5000),
year VARCHAR(1000),
country text(5000),
category text(5000),
star VARCHAR(1000),
people_num VARCHAR(1000),
# daoyan text(5000),
yanyuan text(5000)
);'''
    cursor.execute(ddl)
建立的表中,表名为DBDY_250,存储爬取到的数据:
#存储爬取到的数据
def save_data(data_dict):
sql = '''INSERT INTO DBDY_250(name,year,country,category,star,people_num,yanyuan) VALUES(%s,%s,%s,%s,%s,%s,%s)'''
value_tup = (data_dict['name']
,data_dict['year']
,data_dict['country']
,data_dict['category']
,data_dict['star']
,data_dict['people_num']
# ,data_dict['daoyan']
,data_dict['yanyuan']
)
try:
cursor.execute(sql,value_tup)
db.commit()
except:
print('数据库写入失败')
return
爬取到的文件格式如下:
后续爬取信息会进行数据分析,可视化分析,可以期待一下。需要代码的在评论区留言哦!谢谢,记得点赞哦。