直接上代码了,这次把代码完全优化好了:
import re
import urllib.request

try:
    # Not used by the code below. The import is kept so anything else that
    # relies on the name still finds it, but it is optional so the scraper
    # itself runs without bs4 installed.
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

# Douban Top 250 listing: PAGE_COUNT pages of PAGE_SIZE movies each.
BASE_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25
PAGE_COUNT = 10
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
}


def deal(list):
    """Clean every string in *list* by applying both clean-up passes.

    The parameter name shadows the builtin ``list``; it is kept unchanged so
    existing keyword callers keep working.

    Returns a new list; the input is not modified.
    """
    return replacestr(replacebilank(list))


def replacebilank(list):
    """Return a new list with every space removed from each string."""
    return [item.replace(" ", "") for item in list]


def replacestr(list):
    """Return a new list with unwanted characters removed from each string.

    NOTE(review): as written this removes the same character as
    ``replacebilank``. The original literal may have been a non-breaking
    space or ``&nbsp;`` that was lost in copy/paste -- confirm.
    """
    return [item.replace(" ", "") for item in list]


def gethtml(pages=PAGE_COUNT):
    """Download the listing pages and return their HTML concatenated.

    Args:
        pages: Number of consecutive pages to fetch, starting at the first.

    Returns:
        The decoded HTML of all fetched pages joined into one string.

    Raises:
        urllib.error.URLError: If a request fails.
    """
    chunks = []
    for page in range(pages):
        url = f"{BASE_URL}?start={page * PAGE_SIZE}"
        req = urllib.request.Request(url=url, headers=HEADERS)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            chunks.append(response.read().decode("utf-8"))
    # Every page is kept; previously each iteration overwrote the result and
    # then doubled it, so only one page (duplicated) was returned.
    return "".join(chunks)


def required_compile(str, html=None):
    """Return the cleaned matches of the regular expression *str* in *html*.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    existing keyword callers keep working.

    Args:
        str: Regular expression with one capture group.
        html: Text to search. When omitted, the pages are downloaded with
            ``gethtml()``; pass it explicitly to reuse one download for
            several patterns.

    Returns:
        A list of captured strings after ``deal`` clean-up.
    """
    if html is None:
        html = gethtml()
    return deal(re.compile(str).findall(html))


def required_deta(html=None):
    """Extract the wanted fields, print them, and return them.

    To extract more fields, add another ``required_compile`` pattern here and
    include its result in the ``zip`` below.

    Args:
        html: Text to search. When omitted, the pages are downloaded once and
            shared by all patterns.

    Returns:
        A list of ``[name, image_url, other_title, credits]`` rows.
    """
    if html is None:
        # One download shared by every pattern instead of one per pattern.
        html = gethtml()

    listname = required_compile(r'alt="(.*?)" src="', html)
    listjpg = required_compile(r'src="(.*?)" class=', html)
    listreword = required_compile(r'class="other"> / (.*?)<', html)
    # NOTE(review): the three dots are unescaped, so they match any three
    # characters before "<br>" -- confirm a literal "..." was not intended.
    listgaint = required_compile(r' (.*)...<br>', html)

    # zip stops at the shortest list, so differing match counts no longer
    # raise IndexError. Rows are paired purely by position.
    required_details = [
        [name, jpg, reword, gaint]
        for name, jpg, reword, gaint in zip(listname, listjpg, listreword, listgaint)
    ]
    print(required_details)
    return required_details


if __name__ == "__main__":
    required_deta()
这下代码爽多了吧,想要什么,在 required_deta() 函数里面添加正则表达式就行了,快去试试吧!