#将解析的电影写入excel中
import requests #导入网页请求库
from bs4 import BeautifulSoup #导入网页分析库
import pprint#打印格式
import json
import xlwt
#传入URL
def start_request(url):
    """Download *url* and return the response body as text.

    A timeout is supplied so a stalled connection fails fast instead of
    hanging the script forever (requests has no default timeout).
    """
    r = requests.get(url, timeout=10)
    return r.text
#解析URL
def parser(text):
    """Parse the Douban Top-250 page HTML, write the titles to list.xls,
    and return them as a list.

    Fixes over the first draft: the one-key dict per movie is gone (a
    comprehension collects the titles directly), the redundant counter
    that shadowed the loop index is removed, and the workbook is saved
    once after the loop instead of once per row.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # One <div class="item"> per movie; take its first <span class="title">.
    value_list = [movie.find('span', class_='title').text
                  for movie in soup.find_all('div', attrs={'class': 'item'})]

    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('movies')
    sheet.write(0, 3, 'movie')   # header stays in column 3, as before
    for i, title in enumerate(value_list):
        sheet.write(i + 1, 3, title)
    wbk.save('list.xls')         # save once, after all rows are written
    return value_list
def main():
    """Fetch the first page of Douban Top 250 and export its titles.

    The URL is a template with a ``{}`` pagination slot; the original
    code forgot to fill it in and sent the literal ``{}`` to the server.
    Page offsets go 0, 25, 50, ... — 0 selects the first page.
    """
    url = 'https://movie.douban.com/top250?start={}&filter='
    text = start_request(url.format(0))
    parser(text)

if __name__ == '__main__':
    main()
以上是之前参考一篇文章所写的爬取豆瓣Top250页面内容,今天又重新写了一遍,分为两版,一个是没有封装为函数,另一个封装成函数。这个小练习主要熟悉了BeautifulSoup的用法:先用BeautifulSoup初步解析网页全部源码,之后使用find_all方法按照标签匹配相应内容;但是提取出的内容并不完全是我想要的,所以再一次使用正则表达式进行精确提取,或者使用for循环、if语句筛选内容。提取到Excel部分的内容在另一篇文章中也写过,我基本上还是按照原来的方法做的,详见https://blog.csdn.net/xd060606/article/details/86503872,
未封装成函数的
import requests
import re#正则表达式 regular expression
from bs4 import BeautifulSoup
import xlwt
# Scrape the first page of Douban Top 250 and export four columns to Excel.
url = 'https://movie.douban.com/top250'
res = requests.get(url, timeout=10)   # timeout so a stalled request fails fast
soup = BeautifulSoup(res.text, "html.parser")

# --- Titles ---------------------------------------------------------------
# Each movie has 1-2 <span class="title"> tags; the second, when present,
# holds the English title prefixed with '\xa0/\xa0'.
movie_title = soup.find_all("span", "title")
movie_name = re.findall(r'.*?title">(.*?)<', str(movie_title))
# BUGFIX: the old code called movie_name.remove(i) while iterating the same
# list, which skips the element right after every removal; build a filtered
# copy instead.
movie_name = [name for name in movie_name if '\xa0/\xa0' not in name]
# print(movie_name)

# --- Descriptions ---------------------------------------------------------
movie_desc01 = soup.find_all("p", class_="")
# '.text' never contains markup, so the old replace of '</p>, <p class="">'
# was dead code.  Collapse newline+indentation runs (whatever their width)
# and drop the non-breaking spaces Douban uses as separators.
movie_desc = [re.sub(r'\n\s*', '', p.text).replace('\xa0', '')
              for p in movie_desc01]
# print(movie_desc)

# --- Scores ---------------------------------------------------------------
movie_score01 = soup.find_all('span', class_='rating_num')
movie_score = re.findall(r'.*?average">(.*?)<', str(movie_score01))
# print(movie_score)

# --- Quotes (not every movie has one) -------------------------------------
movie_quote01 = soup.find_all('span', class_='inq')
movie_quote = re.findall(r'.*?inq">(.*?)<', str(movie_quote01))

# --- Excel export ---------------------------------------------------------
wd = xlwt.Workbook()
ws = wd.add_sheet("douban_250")
ws.write(0, 0, '电影名')
ws.write(0, 1, '描述')
ws.write(0, 2, '分数')
ws.write(0, 3, '标签')
for i in range(len(movie_name)):
    ws.write(i + 1, 0, movie_name[i])
    # Guard the shorter columns: quotes (and sometimes descriptions) can be
    # missing, and the old unguarded indexing raised IndexError.
    ws.write(i + 1, 1, movie_desc[i] if i < len(movie_desc) else '')
    ws.write(i + 1, 2, movie_score[i] if i < len(movie_score) else '')
    ws.write(i + 1, 3, movie_quote[i] if i < len(movie_quote) else '')
wd.save("douban.xls")
封装成函数
import requests
import re#正则表达式 regular expression
from bs4 import BeautifulSoup
import xlwt
def catch_title(soup):
    """Return the Chinese title of every movie on the page.

    Each movie carries 1-2 <span class="title"> tags; the second, when
    present, is the English title prefixed with '\\xa0/\\xa0'.

    BUGFIX: the original called ``movie_name.remove(i)`` while iterating
    ``movie_name`` itself, which skips the element right after every
    removal; a filtering comprehension has no such hazard.
    """
    movie_title = soup.find_all("span", "title")
    movie_name = re.findall(r'.*?title">(.*?)<', str(movie_title))
    return [name for name in movie_name if '\xa0/\xa0' not in name]
def catch_desc(soup):
    """Return the cleaned description paragraph of every movie.

    ``.text`` never contains markup, so the original replace of
    '</p>, <p class="">' was dead code and is dropped.  The chain of
    hard-coded ``'\\n' + N-spaces`` replaces depended on the page's exact
    indentation width; a single regex removes any newline+whitespace run.
    Non-breaking spaces ('\\xa0') are stripped as before.
    """
    raw_paragraphs = soup.find_all("p", class_="")
    return [re.sub(r'\n\s*', '', p.text).replace('\xa0', '')
            for p in raw_paragraphs]
def catch_score(soup):
    """Return the rating string (e.g. '9.7') of every movie on the page.

    The rating lives in <span class="rating_num" property="v:average">;
    the regex pulls the text between 'average">' and the closing tag out
    of the stringified result set.
    """
    rating_tags = soup.find_all('span', class_='rating_num')
    return re.findall(r'.*?average">(.*?)<', str(rating_tags))
def catch_quote(soup):
    """Return the one-line tag-line (<span class="inq">) of each movie.

    NOTE(review): not every movie has a quote, so this list may be
    shorter than the title list — the caller must account for that.
    """
    quote_tags = soup.find_all('span', class_='inq')
    return re.findall(r'.*?inq">(.*?)<', str(quote_tags))
def save_excel(movie_name, movie_desc, movie_score, movie_quote):
    """Write the four parallel columns into douban02.xls.

    The row count follows ``movie_name``.  The other columns can be
    shorter (not every movie has a quote, for instance); the original
    unguarded ``movie_quote[i]`` raised IndexError in that case, so
    missing cells are now padded with ''.
    """
    wd = xlwt.Workbook()
    ws = wd.add_sheet("douban_250")
    for col, header in enumerate(('电影名', '描述', '分数', '标签')):
        ws.write(0, col, header)
    for i in range(len(movie_name)):
        ws.write(i + 1, 0, movie_name[i])
        ws.write(i + 1, 1, movie_desc[i] if i < len(movie_desc) else '')
        ws.write(i + 1, 2, movie_score[i] if i < len(movie_score) else '')
        ws.write(i + 1, 3, movie_quote[i] if i < len(movie_quote) else '')
    wd.save("douban02.xls")
def main():
    """Fetch the first page of Douban Top 250, extract the four columns,
    and dump them to douban02.xls."""
    page = requests.get('https://movie.douban.com/top250')
    parsed = BeautifulSoup(page.text, "html.parser")
    save_excel(
        catch_title(parsed),
        catch_desc(parsed),
        catch_score(parsed),
        catch_quote(parsed),
    )

if __name__ == "__main__":
    main()
总体而言,推荐大家使用函数对内容进行封装,后期修改或者添加功能会更加清晰明了。