# -*- coding: utf-8 -*-
# @Author : yocichen
# @Email : yocichen@126.com
# @File : maoyan100.py
# @Software: PyCharm
# @Time : 2019
# @UpdateTime : 2020/4/26
import re
import traceback

import openpyxl
import requests
from requests import RequestException


# Get page's html by requests module
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None``. ``None`` is also returned (after printing the traceback)
        when ``requests`` raises a ``RequestException``, which includes
        timeouts.
    """
    # use headers to avoid 403 Forbidden Error(reject spider)
    # NOTE(review): the spacing inside this User-Agent value looks unusual
    # (e.g. "Mozilla / 5.0"); it is kept exactly as found - confirm against a
    # real browser User-Agent string.
    headers = {'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 53.0.2785.104Safari / 537.36Core / 1.53.4882.400QQBrowser / 9.7.13059.400'}
    # Sometimes, the proxies need to be replaced.
    # You can get them by accessing https://www.kuaidaili.com/free/inha/
    proxies = {'http': '60.190.250.120:8080'}
    try:
        response = requests.get(
            url, headers=headers, proxies=proxies, timeout=timeout
        )
    except RequestException:
        traceback.print_exc()
        return None
    if response.status_code == 200:
        return response.text
    return None


# Get useful info from html of a page by re module
# One <dd>...</dd> board entry. Capture groups, in order: ranking index,
# title, poster URL, starring line, release-time line, integer part of the
# score, fractional part of the score.
# NOTE(review): the tag literals (<dd>, </i>, </p>, title="...") were
# reconstructed from the fragments that survived in the damaged original
# (board-index, data-src, star, releasetime, integer, fraction) and from the
# seven fields write_excel_xlsx consumes - verify against the live page markup.
_MOVIE_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?title="(.*?)"'
    r'.*?data-src="(.*?)"'
    r'.*?star">\s*(.*?)\s*</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Extract the film records from one board page.

    Args:
        html: Page source as text. ``None``, an empty string or any non-text
            value (for example the ``None`` returned by a failed download)
            yields an empty result instead of an error.

    Returns:
        A list with one 7-tuple of strings per matched entry:
        ``(index, title, image_url, stars, release_time, score_integer,
        score_fraction)``. The list is empty when nothing matches.
    """
    if not html or not isinstance(html, str):
        return []
    return _MOVIE_ITEM_PATTERN.findall(html)


# Main call function
def main(url):
    """Fetch the page at ``url`` and return the records parsed from it.

    The downloaded HTML (whatever ``get_one_page`` returns) is handed
    straight to ``parse_one_page`` and that function's result is returned.
    """
    return parse_one_page(get_one_page(url))


# Write the useful info in excel(*.xlsx file)
# Column titles written to the first row of the sheet.
_EXCEL_HEADERS = ('编号', '片名', '宣传图片', '主演', '上映时间', '评分')


def write_excel_xlsx(items, filename='maoyan_top100.xlsx'):
    """Write the scraped film records to an ``.xlsx`` workbook.

    Args:
        items: Sequence of records. The first five fields of each record are
            written as-is; fields six and seven (the two halves of the score)
            are joined into a single sixth column.
        filename: Path of the workbook to create. Defaults to
            ``maoyan_top100.xlsx`` in the current directory.

    An empty ``items`` produces a workbook that contains only the header row
    instead of failing on the first record lookup.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    # First, write col's title.
    ws.append(_EXCEL_HEADERS)
    # Write film's info
    for item in items:
        row = list(item[:5])
        if len(item) > 5:
            # The score arrives split in two parts (e.g. "9." and "5").
            row.append(''.join(item[5:7]))
        ws.append(row)
    # Save the work book as *.xlsx
    wb.save(filename)


if __name__ == '__main__':
    print('spider working...')
    res = []
    url = 'https://maoyan.com/board/4?'
    for i in range(0, 10):
        # The first page is requested without an offset; later pages step by 10.
        page_url = url if i == 0 else url + 'offset=' + str(i * 10)
        res.extend(main(page_url))
    if not res:
        print('warning: no records were parsed; only the header row will be written.')
    print('writing into excel...')
    write_excel_xlsx(res)
    print('work done!\nNote: the data is in the current directory.')