My crawler is running into this error:
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lndzj', port=443): Max retries exceeded with url: /xwzx/zxzq/lnjlqdz/2023062014023570131/index.shtml (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000247D42D9090>: Failed to resolve 'lndzj' ([Errno 11001] getaddrinfo failed)"))
Do I really have to buy a proxy IP to fix this? Any help would be appreciated.
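One thing jumps out of the traceback before any proxy question: the host requests is trying to resolve is just 'lndzj', not 'www.lndzj.gov.cn', so DNS has nothing to look up, and a proxy would not change that. A minimal sketch of the join, assuming the hrefs scraped from the list page are site-relative paths like '/lndzj/...' (which is what the path in the error message suggests):

from urllib.parse import urljoin, urlparse

base = 'https://www.lndzj.gov.cn'
href = '/lndzj/xwzx/zxzq/lnjlqdz/2023062014023570131/index.shtml'

bad = 'https:/' + href      # -> 'https://lndzj/xwzx/...': 'lndzj' is parsed as the hostname
good = urljoin(base, href)  # -> 'https://www.lndzj.gov.cn/lndzj/xwzx/...'

print(urlparse(bad).netloc)   # lndzj  <- the name DNS fails to resolve
print(urlparse(good).netloc)  # www.lndzj.gov.cn

The full script, with the join fixed where the links are built: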
from lxml import etree
import numpy as np
import time
import pandas as pd
import re
import requests
def head_x():  # build the request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
        'Cookie': 'aisteUv=16855165199141237697130; aisiteJsSessionId=1686714548634661824105',
    }
    return header
def html_link():  # collect the links to the second-level pages
    links = []  # renamed from `list` to avoid shadowing the built-in
    for i in range(1, 2):
        print('Scraping page {}'.format(i))
        url = 'https://www.lndzj.gov.cn/lndzj/xwzx/zxzq/lnjlqdz/f43ea563-' + str(i) + '.shtml'
        head = head_x()
        rq = requests.get(url, headers=head, timeout=10)
        html = etree.HTML(rq.content)
        rqq = html.xpath('/html/body/div[@class="container"]/div[@class="row"]/div[@class="row col-md-9 '
                         'col-xs-12"]/div[@id="zwgk2"]/div/div[2]/ul[@class="article"]/li/a/@href')
        # The hrefs are site-relative ('/lndzj/...'), so prepend the full origin.
        # The old 'https:/' + href produced 'https://lndzj/...', which made
        # requests treat 'lndzj' as the hostname -- that is exactly the
        # NameResolutionError in the traceback.
        rqq = 'https://www.lndzj.gov.cn' + pd.Series(rqq)
        links.extend(rqq)
        time.sleep(np.random.randint(1, 3))
    for n, item in enumerate(links, 1):
        print(n, item)
    return links
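# Side note on the "Max retries exceeded" wording: here it is only a symptom of
# the DNS failure, not of rate limiting. If genuinely transient connection errors
# show up later, a shared Session with urllib3's Retry is a lighter fix than paid
# proxies. An optional sketch (not needed for the DNS problem itself):
def make_session():
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session  # then use session.get(url, ...) in place of requests.get(url, ...)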
# def getip():  # build a pool of proxy IPs to avoid bans
#     # needs: from bs4 import BeautifulSoup
#     # (the xicidaili free-proxy site may no longer be online)
#     a = requests.get("https://www.xicidaili.com/nt/", headers=head_x())
#     soup = BeautifulSoup(a.text, 'lxml')
#     ips = soup.findAll('tr')
#     proxy_list = []
#     for x in range(1, len(ips)):
#         ip = ips[x]
#         tds = ip.findAll("td")
#         ip_temp = 'http://' + tds[1].contents[0] + ":" + tds[2].contents[0]
#         proxy_list.append(ip_temp)
#     return proxy_list
# The IPs are collected above; below they are tried against the target site.
# Two bugs fixed from the earlier draft: it passed the loop counter `i` (an int)
# to requests.get() instead of a URL, and the bare `except` swallowed every error.
# for url in html_link():
#     for item in proxy_list:
#         proxies = {
#             'http': item,
#             'https': item,
#         }
#         print(proxies)
#         try:
#             requests.get(url, proxies=proxies, timeout=1)
#             print('ok')
#             break  # stop once a proxy works for this URL
#         except requests.RequestException:
#             continue
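# As for the proxy question itself: requests also honors the standard proxy
# environment variables, so a proxy (if one were ever really needed) can be
# supplied without code changes, e.g. in the shell before running:
#     export HTTPS_PROXY=http://203.0.113.5:8080   # placeholder address
# But no proxy helps here: 'lndzj' is not a resolvable hostname anywhere.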
def second_page():  # scrape the second-level detail pages
    list_link = html_link()  # links to the detail pages
    list_1 = []
    for i in list_link:
        print(i)
        item = {}
        rqqq = requests.get(i, headers=head_x(), timeout=10)
        print(rqqq)
        time.sleep(np.random.randint(1, 3))
        html = etree.HTML(rqqq.content)
        # //text() at the end so xpath returns strings -- joining the bare <p>
        # elements would raise a TypeError; r"\s" avoids the deprecated escape
        item['xiangqing'] = re.sub(r"\s", "", ",".join(
            html.xpath(
                '/html/body/div[@class="container"]/div[@class="row"]/div[@id="detailsPage"]/div[@id="90099936a1214212b6259c892a626843"]/div[2]/div[@class="gov_gxlbox minhh6"]/div[@class="gov_gxlmain"]/div[@class="gov_xlheader"]/div[@class="BSHARE_POP"]/div[@id="UCAP-CONTENT"]/div[@class="TRS_Editor"]/p//text()')))
        list_1.append(item)
        print(item)
    print(list_1)
    return list_1
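# A looser extraction sketch in case the long absolute XPath above breaks:
# anchoring on the TRS_Editor container (already the final hop of that path)
# survives layout changes higher up. The class name comes from the original
# XPath; the helper name is made up for illustration.
def second_page_loose(url):
    rq = requests.get(url, headers=head_x(), timeout=10)
    html = etree.HTML(rq.content)
    paras = html.xpath('//div[@class="TRS_Editor"]//p//text()')
    return re.sub(r"\s", "", ",".join(paras))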
def save_data():
    data = second_page()
    data = pd.DataFrame(data)
    # utf-8-sig keeps the Chinese text readable when the CSV is opened in Excel
    data.to_csv('liaoning.csv', index=False, encoding='utf-8-sig')
    return data
if __name__ == '__main__':
    # call save_data() (not second_page()) so liaoning.csv actually gets written
    print(save_data())
    print("done")