将爬取的电影解析并写入excel中

#将解析的电影写入excel中
import requests #导入网页请求库
from bs4 import BeautifulSoup #导入网页分析库
import pprint#打印格式
import json
import xlwt
#传入URL
def start_request(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Page to download.
        timeout: Seconds to wait before giving up (the original had no
            timeout, so a stalled server would hang the script forever).

    Returns:
        The decoded HTML of the response.
    """
    r = requests.get(url, timeout=timeout)
    return r.text
#解析URL

def parser(text):
    """Extract movie titles from a Douban Top-250 page and save them to list.xls.

    Args:
        text: Raw HTML of the page.

    Returns:
        The list of extracted title strings (also written to column 3 of
        the 'movies' sheet in ``list.xls``).
    """
    soup = BeautifulSoup(text, 'html.parser')
    movie_list = soup.find_all('div', attrs={'class': 'item'})
    # One title per item <div>; the original built a one-key dict per movie
    # and flattened its values, which is just the title string.
    value_list = [movie.find('span', class_='title').text
                  for movie in movie_list]

    # Create the workbook ONCE.  The original recreated it inside the inner
    # loop (throwing away previous sheets), referenced `sheet` after the loop
    # only via leaked loop variables (NameError on an empty page), and called
    # wbk.save() on every single row.
    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('movies')
    sheet.write(0, 3, 'movie')
    for row, title in enumerate(value_list, start=1):
        sheet.write(row, 3, title)
    wbk.save('list.xls')
    return value_list
def main():
    """Download the first Top-250 page and export its titles to Excel."""
    # The URL is a template with a `{}` page-offset placeholder; the original
    # requested it with the literal braces still in place.  Fill in offset 0
    # (first page of 25 movies).
    url = 'https://movie.douban.com/top250?start={}&filter='
    text = start_request(url.format(0))
    parser(text)


if __name__ == '__main__':
    main()

以上是之前参考一篇文章所写的爬取豆瓣 Top250 页面内容的代码。今天又重新写了一遍,分为两版:一版没有封装为函数,另一版封装成了函数。这个小练习主要是熟悉 BeautifulSoup 的用法:先用 BeautifulSoup 解析出网页源码,之后使用 find_all 方法按照标签匹配相应内容;但提取出的内容并不完全是我想要的,所以再一次使用正则表达式进行精确提取,或者使用 for 循环、if 语句筛选内容。提取到 Excel 部分的内容在另一篇文章中也写过,我基本上还是按照原来的方法做的,详见https://blog.csdn.net/xd060606/article/details/86503872

未封装成函数的

import requests
import re#正则表达式  regular expression
from bs4  import BeautifulSoup
import xlwt

#爬取整体内容
# Fetch the whole page (timeout added so a stalled server cannot hang us).
url = 'https://movie.douban.com/top250'
res = requests.get(url, timeout=10)
soup = BeautifulSoup(res.text, "html.parser")

# --- titles ---------------------------------------------------------------
# Each movie has one or two <span class="title"> tags; the extra one holds
# the foreign title prefixed with '\xa0/\xa0'.  Filter with a comprehension:
# the original called list.remove() while iterating the same list, which
# silently skips the element that follows every removal.
movie_title = soup.find_all("span", "title")
movie_name = re.findall(r'.*?title">(.*?)<', str(movie_title))
movie_name = [name for name in movie_name if '\xa0/\xa0' not in name]
# print(movie_name)

# --- descriptions ---------------------------------------------------------
movie_desc = []
for tag in soup.find_all("p", class_=""):
    text = tag.text
    # Strip layout whitespace / non-breaking spaces left over from the HTML.
    for junk in ('</p>, <p class="">',
                 '\n                            ',
                 '\xa0',
                 '\n                        '):
        text = text.replace(junk, '')
    movie_desc.append(text)
# print(movie_desc)

# --- scores ---------------------------------------------------------------
movie_score01 = soup.find_all('span', class_='rating_num')
movie_score = re.findall(r'.*?average">(.*?)<', str(movie_score01))
# print(movie_score)

# --- quotes ---------------------------------------------------------------
movie_quote01 = soup.find_all('span', class_='inq')
movie_quote = re.findall(r'.*?inq">(.*?)<', str(movie_quote01))

# --- write to Excel -------------------------------------------------------
wd = xlwt.Workbook()
ws = wd.add_sheet("douban_250")
ws.write(0, 0, '电影名')
ws.write(0, 1, '描述')
ws.write(0, 2, '分数')
ws.write(0, 3, '标签')

for i in range(len(movie_name)):
    ws.write(i + 1, 0, movie_name[i])
    # Not every movie carries a quote (and occasionally a desc), so these
    # lists can be shorter than movie_name; pad with '' instead of letting
    # the original IndexError abort the export.
    ws.write(i + 1, 1, movie_desc[i] if i < len(movie_desc) else '')
    ws.write(i + 1, 2, movie_score[i] if i < len(movie_score) else '')
    ws.write(i + 1, 3, movie_quote[i] if i < len(movie_quote) else '')

wd.save("douban.xls")

封装成函数

import requests
import re#正则表达式  regular expression
from bs4  import BeautifulSoup
import xlwt


def catch_title(soup):
    movie_title = soup.find_all("span", "title")
    movie_name = re.findall(r'.*?title">(.*?)<', str(movie_title))
# print(movie_name)
    for i in movie_name:
        if '\xa0/\xa0' in i :
            movie_name.remove(i)
    return movie_name

def catch_desc(soup):
    """Return the cleaned description text of each movie entry on the page."""
    cleaned = []
    for tag in soup.find_all("p", class_=""):
        text = tag.text
        # Remove the layout artifacts left behind by the page markup.
        for junk in ('</p>, <p class="">',
                     '\n                            ',
                     '\xa0',
                     '\n                        '):
            text = text.replace(junk, '')
        cleaned.append(text)
    return cleaned


def catch_score(soup):
    """Return the rating number of every movie on the page as strings."""
    span_dump = str(soup.find_all('span', class_='rating_num'))
    return re.findall(r'.*?average">(.*?)<', span_dump)

def catch_quote(soup):
    """Return the one-line quote ('inq' span) of each movie on the page."""
    quote_dump = str(soup.find_all('span', class_='inq'))
    return re.findall(r'.*?inq">(.*?)<', quote_dump)

def save_excel(movie_name, movie_desc, movie_score, movie_quote):
    """Write the four parallel lists into ``douban02.xls``.

    Args:
        movie_name: Titles — defines the number of rows written.
        movie_desc: Descriptions, one per movie (may be shorter).
        movie_score: Ratings, one per movie (may be shorter).
        movie_quote: Quotes, one per movie (often shorter: not every
            movie on Douban has an 'inq' quote span).
    """
    wd = xlwt.Workbook()
    ws = wd.add_sheet("douban_250")
    ws.write(0, 0, '电影名')
    ws.write(0, 1, '描述')
    ws.write(0, 2, '分数')
    ws.write(0, 3, '标签')
    for i, name in enumerate(movie_name):
        ws.write(i + 1, 0, name)
        # The other lists can be shorter than movie_name; the original
        # raised IndexError in that case.  Pad missing cells with ''.
        ws.write(i + 1, 1, movie_desc[i] if i < len(movie_desc) else '')
        ws.write(i + 1, 2, movie_score[i] if i < len(movie_score) else '')
        ws.write(i + 1, 3, movie_quote[i] if i < len(movie_quote) else '')

    wd.save("douban02.xls")
    

def main():
    """Fetch the Douban Top-250 front page and export it to Excel."""
    page = requests.get('https://movie.douban.com/top250')
    soup = BeautifulSoup(page.text, "html.parser")
    # Extract each column, then hand everything to the Excel writer.
    save_excel(catch_title(soup),
               catch_desc(soup),
               catch_score(soup),
               catch_quote(soup))


if __name__ == "__main__":
    main()






总体而言,推荐大家使用函数对内容进行封装,后期修改或者添加功能会更加清晰明了。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值