爬取豆瓣电影信息保存到Excel

 1 from bs4 import BeautifulSoup
 2 import requests
 3 import html.parser
 4 from openpyxl import Workbook,load_workbook
 5 import os
 6 class DouBan(object):
 7 
 8     def __init__(self):
 9         self.url = 'https://movie.douban.com/'
10         self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
11 
12     def openUrl(self, url):
13         response = requests.get(url,headers=self.header)
14         return response
15 
16     def getUrl(self):
17         response = self.openUrl(self.url)
18         douban_html = response.text
19         # print(douban_html)
20         soup = BeautifulSoup(douban_html,'html.parser')
21         hrefs = soup.select("li.poster > a")
22         return hrefs
23         # for href in hrefs:
24         #     print(href['href']
25     def getMsg(self):
26         hrefs = self.getUrl()
27         for num,href in enumerate(hrefs):
28             msg_list = []
29             print(href['href'])
30             response = self.openUrl(href['href'])
31             html_mover = response.text
32             soup = BeautifulSoup(html_mover,'html.parser')
33             all_info = soup.select('div#content')
34             # print(all_info)
35             title = all_info[0].select('h1')[0].text.replace('\n','')
36             msg_list.append(title)
37             # print(title)
38             info = all_info[0].select('#info')[0].text
39             msg_list.append(info)
40             # print(info)
41             describe = all_info[0].select('div#link-report span')[0].text.replace(' ','')
42             msg_list.append(describe)
43             # print(describe)
44             # return title,info,describe
45             for col in range(3):
46                 self.saveMsg(num+1, col+1,  msg_list[col])
47 
48     def saveMsg(self, row_, column_,msg):
49         # msg = self.getMsg()
50         # a = os.path.exists('//move_msg.xlsx')
51         # if a=False:
52         #     os.mkdir('move_msg.xlsx')
53         
54         wb = load_workbook('move_msg.xlsx')
55         sheet = wb.active
56         sheet.cell(row=row_, column=column_).value = msg
57         wb.save('move_msg.xlsx')
58 
59 
60 
61 
if __name__ == "__main__":
    # Script entry point: build the scraper and run one full crawl.
    DouBan().getMsg()

 

转载于:https://www.cnblogs.com/royfans/p/7474662.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值