python电影数据爬取的代码_python 爬取猫眼电影top100数据

# -*- coding: utf-8 -*-
# @Author : yocichen
# @Email : yocichen@126.com
# @File : maoyan100.py
# @Software: PyCharm
# @Time : 2019
# @UpdateTime : 2020/4/26

import re
import traceback

import openpyxl
import requests
from requests import RequestException

# Get page's html by requests module

def get_one_page(url, proxies=None, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        proxies: Optional scheme-to-proxy mapping handed to ``requests.get``.
            When ``None`` the original hard-coded free proxy is used.
            Sometimes the proxies need to be replaced; you can get them by
            accessing https://www.kuaidaili.com/free/inha/
        timeout: Seconds to wait for the server before giving up, so a dead
            proxy or stalled connection cannot hang the script forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``. ``None`` is also returned (after printing the traceback)
        when ``requests`` raises a ``RequestException``.
    """
    # Use headers to avoid 403 Forbidden Error (reject spider).
    headers = {'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 53.0.2785.104Safari / 537.36Core / 1.53.4882.400QQBrowser / 9.7.13059.400'}
    if proxies is None:
        # NOTE(review): this mapping only has an 'http' entry; per the
        # requests proxies documentation it is keyed by URL scheme, so it is
        # presumably not applied to https:// URLs -- confirm.
        proxies = {'http': '60.190.250.120:8080'}
    try:
        response = requests.get(
            url, headers=headers, proxies=proxies, timeout=timeout
        )
    except RequestException:
        traceback.print_exc()
        return None
    if response.status_code == 200:
        return response.text
    return None


# Get useful info from html of a page by re module

# NOTE(review): the pattern in the extracted source had its HTML tags stripped
# and its title group lost. It is reconstructed here from the visible
# fragments (board-index, data-src, star, releasetime, integer, fraction) and
# the seven-field layout that write_excel_xlsx consumes -- verify it against
# the live page markup before relying on it.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)<'
    r'.*?title="(.*?)"'
    r'.*?data-src="(.*?)"'
    r'.*?star">\s*(.*?)\s*</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,  # let '.' cross newlines: every entry spans several lines
)


def parse_one_page(html):
    """Extract the movie records from the HTML of one board page.

    Args:
        html: Page source as a string. ``None`` (a failed download) or any
            other non-string value is accepted and yields no records.

    Returns:
        A list of 7-tuples of strings, one per matched entry:
        ``(index, title, image_url, stars, release_time, score_integer,
        score_fraction)``. The list is empty when nothing matches.
    """
    if not isinstance(html, str):
        # Nothing to parse; returning [] keeps the "never raises" contract.
        return []
    return _MOVIE_PATTERN.findall(html)


# Main call function

def main(url):
    """Fetch one board page and return the movie records parsed from it.

    Args:
        url: Address of the board page to scrape.

    Returns:
        The list produced by ``parse_one_page`` for the downloaded HTML, or
        an empty list when ``get_one_page`` returned ``None``.
    """
    page_html = get_one_page(url)
    if page_html is None:
        # Download failed; do not hand None to the parser.
        return []
    return parse_one_page(page_html)


# Write the useful info in excel(*.xlsx file)

def write_excel_xlsx(items):
    """Write the scraped movie records to ``maoyan_top100.xlsx``.

    Args:
        items: Sequence of records; each record is indexable and holds
            ``(index, title, image_url, stars, release_time, score_integer,
            score_fraction)`` as strings. An empty sequence is allowed and
            produces a workbook containing only the header row.

    The file is saved in the current working directory.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # First, write col's title.
    ws.append(['编号', '片名', '宣传图片', '主演', '上映时间', '评分'])

    # Write film's info: the first five fields go in as-is, and the two score
    # fragments (integer part and fraction part) are joined into one cell.
    for item in items:
        ws.append(list(item[:5]) + [''.join(item[5:7])])

    # Save the work book as *.xlsx
    wb.save('maoyan_top100.xlsx')


if __name__ == '__main__':
    print('spider working...')
    res = []
    url = 'https://maoyan.com/board/4?'
    for i in range(0, 10):
        # The first page is requested without an offset; later pages step
        # the offset by 10.
        page_url = url if i == 0 else url + 'offset=' + str(i * 10)
        res.extend(main(page_url))
    print('writing into excel...')
    write_excel_xlsx(res)
    print('work done!\nNote: the data is in the current directory.')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值