# coding: utf-8
# author: hmk
"""Scrape the Maoyan Top-100 movie board and store the rows in MySQL."""

import requests
from bs4 import BeautifulSoup
import bs4
import pymysql.cursors


def get_html(url, header):
    """Download *url* and return the decoded page text, or None on failure.

    Parameters:
        url: page address to fetch.
        header: dict of HTTP request headers (anti-bot headers from caller).

    Returns:
        The response body as text when the server answers 200, else None.
    """
    try:
        r = requests.get(url=url, headers=header, timeout=20)
        # Let requests sniff the encoding from the body, not just the headers.
        r.encoding = r.apparent_encoding
        if r.status_code == 200:
            return r.text
        return None
    except requests.RequestException:
        # Network errors, timeouts and protocol failures all degrade to
        # "no page" — the caller is expected to handle a None result.
        return None


def get_data(html, list_data):
    """Parse one board page and append one row per movie to *list_data*.

    Each appended row is [ranking, movie, release_time, score];
    *list_data* is mutated in place, nothing is returned.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for t in soup.find_all('dd'):
        # Skip non-tag nodes: blank lines between <dd> elements are parsed
        # as NavigableString children and have none of the attributes below.
        if isinstance(t, bs4.element.Tag):
            ranking = t.i.string  # rank number inside the <i> tag
            movie = t.find('p', class_='name').string
            release_time = t.find('p', class_='releasetime').string
            # The score is split across two child nodes
            # (integer part + fractional part) — concatenate them.
            score_tag = t.find('p', class_='score')
            score = score_tag.contents[0].string + score_tag.contents[1].string
            list_data.append([ranking, movie, release_time, score])


def write_sql(data):
    """Insert every movie row in *data* into the maoyan_movie table.

    *data* is a list of [ranking, movie, release_time, score] lists, one
    per movie; each row becomes one parameterized INSERT. Rows that fail
    are rolled back and reported, and the run continues with the next row.
    """
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='123456',
                           db='test',
                           charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # Invariant SQL hoisted out of the loop; %s placeholders keep
            # the insert safe against SQL injection.
            sql = "insert into maoyan_movie(ranking,movie,release_time,score) values(%s, %s, %s, %s)"
            for movie in data:
                try:
                    cur.execute(sql, movie)
                    conn.commit()  # commit per row so partial runs persist
                    print('导入成功')
                except pymysql.MySQLError:
                    # Keep the connection usable for the remaining rows.
                    conn.rollback()
                    print('导入失败')
        finally:
            cur.close()
    finally:
        # Always release the connection, even if an unexpected error
        # escapes the insert loop.
        conn.close()
def main():
    """Crawl the Maoyan Top-100 board page by page and store each page.

    Iterates over `depth` pages (10 movies per page via the ?offset=
    query parameter), parses each page, and writes its rows to MySQL.
    """
    start_url = 'http://maoyan.com/board/4'
    depth = 10  # crawl depth: number of board pages to fetch
    header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
              "Accept-Encoding": "gzip, deflate, sdch",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Cache-Control": "max-age=0",
              "Connection": "keep-alive",
              "Host": "maoyan.com",
              "Referer": "http://maoyan.com/board",
              "Upgrade-Insecure-Requests": "1",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"}
    for i in range(depth):
        url = start_url + '?offset=' + str(10 * i)
        html = get_html(url, header)
        if html is None:
            # Download failed (non-200 or network error): skip this page
            # instead of crashing BeautifulSoup with a None input.
            continue
        list_data = []
        get_data(html, list_data)
        write_sql(list_data)
        print(list_data)


if __name__ == "__main__":
    main()