"""Collect link texts and addresses from the CSRC index pages into an Excel sheet."""

from urllib.parse import urljoin

import lxml  # noqa: F401  -- parser backend requested from BeautifulSoup below
import openpyxl  # noqa: F401  -- imported by the original script; presumably the .xlsx writer engine
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame

# Number of index pages to walk (index_7401.htm, index_7401_1.htm, ...).
PAGE_COUNT = 54


def get_url(url, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The page parsed with the ``lxml`` parser.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (an error page must not be scraped as data).
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    # Feed the raw bytes to the parser: the decoded text came out garbled.
    return BeautifulSoup(res.content, 'lxml')


def _page_url(stem, index):
    """Return the address of index page *index*; page 0 has no numeric suffix."""
    if index == 0:
        return stem + ".htm"
    return stem + "_" + str(index) + ".htm"


def _extract_links(soup, page_url):
    """Return ``(text, absolute_url)`` for every ``<a>`` in *soup* that has an href.

    Anchors without an ``href`` attribute are skipped instead of raising
    ``KeyError``. Relative hrefs are resolved against *page_url*, which for
    the ``../../...`` form yields the same address as stripping the first
    five characters and prefixing the site root.
    """
    links = []
    for anchor in soup.select("a"):
        href = anchor.get("href")
        if href is None:
            continue
        links.append((anchor.get_text(), urljoin(page_url, href)))
    return links


# The listing is rendered via AJAX; this address was taken from the response
# seen in the browser's network panel.
url = "http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401"

# A holds the link texts, B the matching absolute addresses. The short names
# are kept because code outside this view may refer to them.
A = []
B = []
for page_index in range(PAGE_COUNT):
    current_url = _page_url(url, page_index)
    for title, address in _extract_links(get_url(current_url), current_url):
        A.append(title)
        B.append(address)

data = {"处罚书": A, "地址": B}
df = DataFrame(data)
# The addresses are long; widen the column limit so printing does not truncate them.
pd.set_option('display.max_colwidth', 200)
print(df)
df.to_excel("处罚案例.xlsx")