Fetching proxy IPs
from bs4 import BeautifulSoup
import requests
import random
def get_ip_list(url, headers):
    # Scrape the proxy listing page and collect entries as 'ip:port' strings.
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # start at 1 to skip the table header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip(ip_list):
    # Pick one proxy at random and wrap it in the dict format requests expects.
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
# if __name__ == '__main__':
#     url = 'http://www.xicidaili.com/nn/'
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
#     }
#     ip_list = get_ip_list(url, headers=headers)
#     proxies = get_random_ip(ip_list)
#     print(proxies)
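Free proxies scraped from public lists are often dead on arrival. A minimal validation sketch, assuming the httpbin.org test endpoint, a 5-second timeout, and the hypothetical helper name get_working_proxy (all my additions, not part of the original script):

def get_working_proxy(ip_list, attempts=5):
    # Hypothetical helper: try up to `attempts` random proxies and return
    # the first one that can fetch a test page within 5 seconds.
    for _ in range(attempts):
        proxies = get_random_ip(ip_list)
        try:
            requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
            return proxies
        except requests.RequestException:
            continue  # dead or slow proxy, draw another
    return None  # none of the sampled proxies responded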
Main code
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from 项目.ip代理 import get_ip_list, get_random_ip  # the proxy helpers from the module above

# Python 2 only:
# from requests.packages.urllib3.exceptions import InsecureRequestWarning
# # Suppress insecure-request warnings
# requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def ClassificationPapers(url):
    # Collect the URL of every category page from the site's sub-navigation bar.
    L = []  # holds all category page URLs
    proxies = ip()
    response = requests.get(url, proxies=proxies)
    # print(proxies)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    div = soup.find(id='subnav')
    if div:
        a_list = div.find_all(name='a')
        for a in a_list:
            href = a.get('href')
            L.append({a.text: first_https + href})  # first_https is the site root set in __main__
    # print(L)
    return L
# Collect the content URL of every article on each category page.
def Papers(L):
    for https in L:
        if https:
            for k, v in https.items():
                proxies = ip()
                response1 = requests.get(v, proxies=proxies)
                # print(proxies)
                response1.encoding = 'gbk'
                soup1 = BeautifulSoup(response1.text, 'lxml')
                div = soup1.find(id='articlelist')
                ul = div.find(name='ul')
                li_list = ul.find_all(name='li')
                for li in li_list:
                    a = li.find(name='a')
                    href = a.get('href')
                    lunwen_list.append({a.text: href})  # lunwen_list is the global list defined in __main__
    return lunwen_list
def AnalyticalPapers(lunwen_list, x=0, y=0):
    # Download each article, save its illustration (if any) and its text.
    # x counts saved images, y counts saved text files.
    p_lists = []
    for i in lunwen_list:
        for m, n in i.items():
            proxies = ip()
            response2 = requests.get(n, proxies=proxies)
            # print(proxies)
            response2.encoding = 'gbk'
            soup2 = BeautifulSoup(response2.text, 'lxml')
            div = soup2.find('div', {'id': 'content'})
            # Grab the article title; it names both the image and the text file.
            div1 = soup2.find('div', {'id': 'article'})
            title = div1.find(name='h1').text
            # Look for an illustration.
            img = div.find(name='img')
            if img:
                src = img.get('src')
                if src:
                    pictureName = title + '.png'
                    # proxies = ip()
                    picture = requests.get(first_https + src, proxies=proxies)
                    print(proxies)
                    filepath_pictureName = 'D:\\PROJECT\\picture\\' + pictureName
                    if not os.path.isfile(filepath_pictureName):
                        with open(filepath_pictureName, 'wb') as f:
                            f.write(picture.content)
                        x += 1
                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + ' Saved image #{}.......'.format(x), 'image name: ' + title)
                    else:
                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + ' Image already exists, skipping', 'image name: ' + title)
            # Collect the article paragraphs.
            p_list = div.find_all(name='p')
            p_lists.append(p_list)
            # div1 = soup2.find('div', {'id': 'article'})
            # title = div1.find(name='h1').text
            filepath_name = 'D:\\PROJECT\\lunwen\\' + title + '.txt'
            if not os.path.isfile(filepath_name):
                # Open once, write the title as the first line, then every paragraph
                # (the original reopened the file per paragraph and used a flag
                # variable to write the title only once).
                with open(filepath_name, 'a+', encoding="utf-8") as f1:
                    f1.write(title + '\n')
                    for p in p_list:
                        f1.write(p.text + '\n')
                y += 1
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + ' Saved file #{}'.format(y), 'file name: ' + title)
            else:
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + title + ' file already exists, skipping')
    # print(p_lists)
    return p_lists, x, y
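# Note: article titles are used directly as file names above, but Windows
# forbids characters like \ / : * ? " < > | in file names, so open() will
# fail on titles containing any of them. A minimal sanitizer sketch
# (hypothetical helper, not called by the original code):
import re
def safe_filename(title):
    # Replace characters Windows forbids in file names with underscores.
    return re.sub(r'[\\/:*?"<>|]', '_', title)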
def fun(p_lists):
    # Extract every in-text link from the collected paragraphs.
    href_list = []
    for p_list in p_lists:
        for p in p_list:
            a_list = p.find_all(name='a')
            # print(a_list)
            for a in a_list:
                href = a.get('href')
                if not href:  # the original compared the tag itself to '', which never matches; skip anchors without an href instead
                    continue
                href_list.append({a.text: href})
    return href_list
def ip():
    # Scrape a fresh proxy list and return one random proxy dict.
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)
    return proxies
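# Note: ip() re-scrapes the entire proxy listing for every single request,
# which is slow and hammers the proxy site. A minimal cached variant
# (hypothetical helper, not called by the code below) that fetches the list
# once and only re-draws a random proxy on each call:
_cached_ip_list = None
def ip_cached():
    global _cached_ip_list
    if _cached_ip_list is None:
        _cached_ip_list = get_ip_list('http://www.xicidaili.com/nn/', headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        })
    return get_random_ip(_cached_ip_list)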
if __name__ == '__main__':
    # header = {
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    #     'Accept-Encoding': 'gzip, deflate, br',
    #     'Accept-Language': 'zh-CN,zh;q=0.9',
    #     'Connection': 'close',
    #     'Cookie': 'BAIDUID=919FB8CEF5692A814DD7436D01B8E0FE:FG=1; BIDUPSID=919FB8CEF5692A814DD7436D01B8E0FE; PSTM=1554559379; MCITY=-340%3A; __cfduid=d14f04dc95d4c7db585310c0e1f07ab331568031553; BDUSS=zFXOGd1VlMtelZ1UkxXd29TYWJQaX5-OXBGY1M1VDhPa3FDa0ZCWGhpLTlVNTlkSVFBQUFBJCQAAAAAAAAAAAEAAACgA3uvbHZqdW5ibzEzNDA0NwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL3Gd129xnddM; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=tjKsJeCCxG3jBe6wAp1ZHCXuPodtJnXXd9GB3J; H_BDCLCKID_SF=tRk8oK-atDvbfP0k54cHh-7H-UnLqb3BW57Z0lOnMp05jloNjRJNK5_ly-bv-lOy5TnZWfn95ITnECO_e4bK-TrXjG7P; H_PS_PSSID=1435_21080_29523_29721_29567_29221_22160; delPer=0; PSINO=6; locale=zh',
    #     'Referer': 'https://pos.baidu.com/wh/o.htm?ltr=',
    #     'Upgrade-Insecure-Requests': '1',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    # }
    lunwen_list = []  # holds all article URLs; Papers() appends to this global and it is never cleared, so later passes re-process earlier articles
    first_https = "https://www.lunwendata.com"
    L = ClassificationPapers("https://www.lunwendata.com/thesis/List_6.html")
    lunwen_list = Papers(L)
    p_lists, x, y = AnalyticalPapers(lunwen_list)
    href_list = fun(p_lists)
    # print(href_list)
    for href in href_list:
        for k, v in href.items():
            L = ClassificationPapers(v)
            lunwen_list = Papers(L)
            p_lists, x, y = AnalyticalPapers(lunwen_list, x, y)
            print(p_lists)
    # AnalyticalPapers(Papers(ClassificationPapers()))
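One dead proxy currently aborts the whole crawl, since every bare requests.get call raises on a connection failure. A minimal retry sketch that could replace those calls, assuming a hypothetical fetch() helper, a retry count of 3, and a 10-second timeout (none of which are in the original):

def fetch(url, retries=3):
    # Try up to `retries` different random proxies before giving up.
    for _ in range(retries):
        try:
            return requests.get(url, proxies=ip(), timeout=10)
        except requests.RequestException:
            continue  # this proxy failed; draw a new one
    raise RuntimeError('all proxies failed for ' + url)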