1. Watch whether a list is a global or a local variable, otherwise the stored content may end up duplicated;
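A tiny self-contained demo of that pitfall (the names here are made up for illustration, not from the scripts below): a module-level list keeps old entries across calls, while a list created inside the function starts empty every time.
rows = []                              # module-level (global) list: grows across calls
def collect_global(items):
    for x in items:
        rows.append(x)
    return rows
def collect_local(items):
    local_rows = []                    # local list: reset on every call
    for x in items:
        local_rows.append(x)
    return local_rows
print(collect_global(['a', 'b']))      # ['a', 'b']
print(collect_global(['a', 'b']))      # ['a', 'b', 'a', 'b']  <- old entries pile up
print(collect_local(['a', 'b']))       # ['a', 'b'] every time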
2. Standardize your naming and name things in order as you write; otherwise you end up with too many near-identical names later, which makes errors likely and bugs painful to track down;
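For example, the first script below names its containers after the layer of links they hold, so it stays obvious what each one stores:
link_list_1 = []    # layer-2 links found on the layer-1 page
link_list_2 = []    # layer-3 links found on each layer-2 page
link_list_3 = []    # final article pages that get parsed into the CSV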
3. If the scraped content comes back with encoding errors, set the encoding explicitly;
# headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# response = requests.get(url = path,headers = headers)
# # Set the encoding format; without it the text is garbled
# response.encoding = "utf-8"
# response = response.text
4. driver.get() is very useful; it can fetch pretty much anything, it is just a bit slower;
driver = webdriver.Chrome()
# Send a request to the target url
driver.get(path)
# Refresh the page in case it has not rendered yet
driver.refresh()
# Wait for the page to render
time.sleep(2)
# Grab the page source
html = driver.page_source
# Quit the automated browser so no process is left behind
driver.quit()
soup = BeautifulSoup(html,'html.parser')
This script uses several nested loops just to obtain the links of the pages to scrape, layer by layer; the target links cannot be fetched directly, they can only be reached one layer at a time.
# 中国银行间市场交易商协会 (NAFMII)
import csv
import time
import requests
import os
from bs4 import BeautifulSoup
from selenium import webdriver

def get_html(path):
    driver = webdriver.Chrome()
    # Send a request to the target url
    driver.get(path)
    # Refresh the page in case it has not rendered yet
    driver.refresh()
    # Wait for the page to render
    time.sleep(2)
    # Grab the page source
    html = driver.page_source
    # Quit the automated browser so no process is left behind
    driver.quit()
    soup = BeautifulSoup(html, 'html.parser')
    return soup
if __name__ == '__main__':
    start = time.time()
    # Layer-1 link
    path_1 = r'https://www.nafmii.org.cn/yj/jrscyj/xswyh/202112/t20211214_116082.html'
    soup = get_html(path_1)
    link_info_1 = soup.find_all('a')[30:39]
    # print(link_info_1)
    # link_list_1 holds the layer-2 links
    link_list_1 = []
    for link in link_info_1:
        link_1 = link.get('href')
        print(link_1)
        # title = link.text
        # Storing these in a dict keyed by title might be a bit more convenient
        # link_list[title] = link1
        # print(title)
        link_list_1.append(link_1)
    print('link_list_1:%s' % link_list_1)
    # Use the layer-2 links to get the layer-3 links
    link_list_2 = []
    for link2 in link_list_1:
        soup1 = get_html(link2)
        link_info_2 = soup1.find_all('div', class_='text-cont')
        # To get the link, narrow the scope first, then parse further
        for row_1 in link_info_2:
            link_2 = row_1.find_all('a')[0]
            link_2 = link_2.get('href').replace('./', '')
            # print(type(link_1))
            path2 = os.path.join(link2, link_2)
            link_list_2.append(path2)
    print('link_list_2:%s' % link_list_2)
    # link_list_3 holds the layer-3 links, i.e. the pages we actually want to scrape
    link_list_3 = []
    for link3 in link_list_2:
        soup2 = get_html(link3)
        link_info_3 = soup2.find_all('div', class_='yh_sub_textList ty_list')
        # print(link_info_3)
        # <div class="yh_sub_textList ty_list">
        for row_2 in link_info_3:
            row_2_1 = row_2.find_all('a')[0]
            link4 = row_2_1.get('href').replace('./', '')
            # print(link4)
            final_list = os.path.join(link3, link4)
            link_list_3.append(final_list)
    print(link_list_3)
    fp = open(r'D:\test\zhognguoyinhangjianshichang1.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(fp)
    for final_link in link_list_3:
        final_info = []
        soup3 = get_html(final_link)
        title = soup3.find_all('h1')[0].text + '\n'
        fp.write(title)
        main_info = soup3.find_all('div', class_="yh_viewTextBox")
        # <div class="yh_viewTextBox">
        for row_3 in main_info:
            row_3_1 = row_3.find_all('tr')
            for row_3_2 in row_3_1:
                row_3_3 = row_3_2.find_all('td')
                # The 2nd and 3rd columns can be empty, so each case is handled separately
                j_title = row_3_3[0].text.replace('\n', '')
                if len(row_3_3) == 1:
                    name = ''
                    final_info.append([j_title, name])
                if len(row_3_3) == 2:
                    name = row_3_3[1].text.replace('\n', '')
                    final_info.append([j_title, name])
                if len(row_3_3) == 3:
                    name = row_3_3[1].text.replace('\n', '')
                    col3 = row_3_3[2].text.replace('\n', '')
                    final_info.append([j_title, name, col3])
        # print(final_info)
        writer.writerows(final_info)
    fp.close()
    end = time.time()
    r_time = end - start
    print(r_time)
    print('finished!')
I realize that handling each td separately like this is not very concise and completely unnecessary; a plain for loop over the td cells does the job, since in the end it is just lists stored inside a list. Silly me.
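A sketch of that simplification, reusing the variable names from the table-reading loop above; rows with fewer cells simply become shorter lists, which csv.writerows handles fine:
for row_3_2 in row_3_1:
    # collect whatever cells the row has, instead of branching on len(row_3_3)
    cells = [td.text.replace('\n', '') for td in row_3_2.find_all('td')]
    final_info.append(cells)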
# China Association for Public Companies (CAPCO) specialized committees
import csv
import time
import os
from bs4 import BeautifulSoup
from selenium import webdriver

# Fetch and parse the html
def get_html(path):
    driver = webdriver.Chrome()
    driver.get(path)
    # Refresh first, then give the page time to render before reading the source
    driver.refresh()
    time.sleep(3)
    html = driver.page_source
    driver.quit()
    soup = BeautifulSoup(html, 'html.parser')
    return soup
if __name__ == '__main__':
    # Layer-1 links (first attempt, kept commented out)
    # path = 'https://www.capco.org.cn/xhdt/hyzywyh/cwzjzywyh/index.html'
    # soup = get_html(path)
    # # print(soup)
    # soup = soup.find_all('ul', class_='navList')
    # # New list to hold the layer-1 links
    # url_list = []
    # # Store the first committee's link too, so everything can be handled together
    # url_list.append(path)
    # # The first <a> in the nav is the original link itself, so start storing from the second one
    # for row in soup:
    #     row = row.find_all('a')[1::]
    #     for url in row:
    #         link = url.get('href').replace('../', '')
    #         pinjie = 'https://www.capco.org.cn/xhdt/hyzywyh/'
    #         fir_link = os.path.join(pinjie, link)
    #         url_list.append(fir_link)
    #         # print(fir_link)
    # print(url_list)
    # # Scrape the layer-2 links; one issue to watch here is how to tell which pages hold the committee member lists
    # for link1 in url_list:
    #     soup1 = get_html(path)
    #     link_info = soup1.find_all('ul', class_='listWrap mt20')
    #     for row in link_info:
    #         print(row.text)
    s = time.time()
    # The pages to scrape (hard-coded here)
    list1 = [
        'https://www.capco.org.cn/hyxx/fhz/index.html',
        'https://www.capco.org.cn/hyxx/jsh/jshcy/',
        'https://www.capco.org.cn/xhdt/hyzywyh/cwzjzywyh/201909/20190920/j_2019092017422400015689725955897856.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/dldswyh/201904/20190418/j_2019041817540900015689727295348545.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/bgrzwyh/201909/20190920/j_2019092017483500015689729664587942.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/dshmswyh/201909/20190920/j_2019092017501500015689730664697968.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/jshzywyh/201909/20190920/j_2019092017513300015689731443997993.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/tzzglwyh/201909/20190920/j_2019092017524700015689732182128018.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/xsgwwyh/202206/20220607/j_2022060714184200016545827360056820.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/szw/202206/20220622/j_2022062221114900016559035224143357.html',
        'https://www.capco.org.cn/xhdt/hyzywyh/xnyyznqcwyh/202210/20221028/j_2022102809352000016669209337028831.html',
    ]
    print(list1)
    final_info = []
    for url in list1:
        soup = get_html(url)
        soup = soup.find_all('div', class_='artical')
        for row in soup:
            title = row.find_all('h3')[0].text + '\n'
            final_info.append([title])
            print(title)
            main_info = row.find_all('tr')
            for cell in main_info:
                cell = cell.find_all('td')
                main = []
                for td in cell:
                    td = td.text
                    # print(td)
                    main.append(td)
                final_info.append(main)
    fp = open(r'D:\test\shangshixiehui-xiehui.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(fp)
    writer.writerows(final_info)
    e = time.time()
    run = e - s
    fp.close()
    print('runtime is %d' % run)
    print('finished!')