1. 爬取数据
1.1 导入以下模块
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from openpyxl import Workbook, load_workbook
1.2 获取每页电影链接
def getonepagelist(url,headers):
    """Fetch one listing page and process every film linked from it.

    Downloads ``url`` (10 second timeout), decodes the response as UTF-8,
    finds every element whose class is ``hd``, and passes the ``href`` of
    its first ``<a>`` child to ``getfilminfo``.

    Args:
        url: Address of the listing page to download.
        headers: HTTP headers forwarded to ``requests.get`` and to
            ``getfilminfo``.

    Returns:
        None. Any ordinary error (HTTP failure, missing anchor, an error
        raised by ``getfilminfo``) is reported on stdout and stops the
        processing of this page; it is not re-raised.
    """
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        lsts = soup.find_all(attrs={'class': 'hd'})
        for lst in lsts:
            href = lst.a['href']
            # Pause before each detail request to throttle the crawl.
            time.sleep(0.5)
            getfilminfo(href, headers)
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the crawl, and print
        # the cause so failures can be diagnosed.
        print('getonepagelist error!', exc)
1.3 获取每部电影具体信息
def getfilminfo(url,headers):
    """Download a single film page and parse it with BeautifulSoup.

    Args:
        url: Address of the film page to download.
        headers: HTTP headers passed to ``requests.get``.

    Raises:
        requests.HTTPError: via ``raise_for_status`` when the response
            has an error status code.

    NOTE(review): the visible body only fetches and parses the page;
    ``filminfo`` is created but never filled or returned, so the field
    extraction logic appears to have been elided -- confirm against the
    complete source.
    """
    response = requests.get(url, timeout=10, headers=headers)
    response.raise_for_status()
    # Force UTF-8 decoding before reading ``response.text``.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    filminfo = []
1.4 保存数据
def insert2excel(filepath,allinfo):
try:
if not os.path.exists(filepath):
tableTitle = ['片名','上映年份','评分','评价人数&