本次爬取的是Top250榜单中250部电影的相关内容:用requests请求URL,用正则表达式(re)和BeautifulSoup提取、过滤内容,
用openpyxl操作Excel。本人是刚入门不久的新手,代码写得比较笨拙、啰嗦,希望各位大神多提建议。
首先,代码清单如下:
import requests
import re
from bs4 import BeautifulSoup
import openpyxl
def get_movie_top250_name(soup):
    """Return the primary movie titles found in ``soup``, without aliases.

    Args:
        soup: An object exposing ``find_all(name, class_=...)`` (e.g. a
            parsed BeautifulSoup document). Every ``<span class="title">``
            element it returns is considered.

    Returns:
        A list of title strings in document order. Entries containing a
        non-breaking space (``'\\xa0'``), which is how alias titles are
        recognised here, are excluded.
    """
    # Collect every <span class="title"> element.
    targets = soup.find_all('span', class_="title")
    # Strip the surrounding tags by running a regex over the string form
    # of the whole result list.
    targets_name = re.findall(r'.*?title">(.*?)<\/span', str(targets))
    # Build a new list rather than calling remove() while iterating:
    # removing during iteration shifts later items left, so the entry
    # right after a removed alias was never examined and consecutive
    # aliases leaked into the result.
    return [each for each in targets_name if '\xa0' not in each]
def get_movie_top250_workers(soup):
targets = soup.find_all('p',class_="")
targets_workers = []
for each in targets:
targets_workers.append(each.text.replace('<p class="">','').replac