Scraping Papers

Getting proxy IPs

from bs4 import BeautifulSoup
import requests
import random

def get_ip_list(url, headers):
    # Scrape the proxy listing page: each proxy sits in a <tr>,
    # with the IP in the second <td> and the port in the third.
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the header row
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    # Pick one proxy at random and wrap it in the dict format
    # that requests expects for its proxies argument.
    proxy_list = ['http://' + ip for ip in ip_list]
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

# if __name__ == '__main__':
#     url = 'http://www.xicidaili.com/nn/'
#     headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
#         }
#     ip_list = get_ip_list(url, headers=headers)
#     proxies = get_random_ip(ip_list)
#     print(proxies)
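
Note that get_random_ip only fills in the 'http' key, so requests sent to https:// URLs (which the main code below uses) are not actually routed through the proxy. A minimal usage sketch, assuming you also want the proxy applied to https URLs; the extra 'https' entry, the target URL, and the timeout are illustrative assumptions, not part of the original script:

import requests
# get_ip_list / get_random_ip are the helpers defined above
headers = {'User-Agent': 'Mozilla/5.0'}
ip_list = get_ip_list('http://www.xicidaili.com/nn/', headers=headers)
proxies = get_random_ip(ip_list)
proxies['https'] = proxies['http']  # assumption: reuse the same proxy for https URLs
resp = requests.get('https://www.lunwendata.com', proxies=proxies, timeout=10)
print(resp.status_code)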

Main code

The pipeline: ClassificationPapers collects the category links from the site's sub-navigation, Papers collects every article URL in each category, AnalyticalPapers downloads each article's image and paragraphs to local files, and fun gathers the links found inside the article text so the same pipeline can be run on them.

import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from 项目.ip代理 import get_ip_list, get_random_ip  # local module containing the proxy helpers above
# python2
# from requests.packages.urllib3.exceptions import InsecureRequestWarning
# # disable insecure-request warnings
# requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# first_https and lunwen_list are module-level globals defined in the __main__ block below.
def ClassificationPapers(url):
    # Collect the category links from the site's sub-navigation bar.
    L = []  # one {category name: absolute URL} dict per category
    proxies = ip()
    response = requests.get(url, proxies=proxies)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    div = soup.find(id='subnav')
    if div:
        for a in div.find_all(name='a'):
            href = a.get('href')
            L.append({a.text: first_https + href})
    return L
# Visit every category page and collect the URL of each article listed on it.
def Papers(L):
    for https in L:
        if https:
            for k, v in https.items():
                proxies = ip()
                response1 = requests.get(v, proxies=proxies)
                response1.encoding = 'gbk'
                soup1 = BeautifulSoup(response1.text, 'lxml')
                div = soup1.find(id='articlelist')
                ul = div.find(name='ul')
                for li in ul.find_all(name='li'):
                    a = li.find(name='a')
                    href = a.get('href')
                    lunwen_list.append({a.text: href})
    return lunwen_list
def AnalyticalPapers(lunwen_list, x=0, y=0):
    # Download each article: save its image (if any) and its paragraphs to disk.
    # x counts saved images, y counts saved text files.
    p_lists = []
    for i in lunwen_list:
        for m, n in i.items():
            proxies = ip()
            response2 = requests.get(n, proxies=proxies)
            response2.encoding = 'gbk'
            soup2 = BeautifulSoup(response2.text, 'lxml')
            div = soup2.find('div', {'id': 'content'})
            # the article title doubles as the image and text file name
            div1 = soup2.find('div', {'id': 'article'})
            title = div1.find(name='h1').text
            # look for an image inside the article body
            img = div.find(name='img')
            if img:
                src = img.get('src')
                if src:
                    pictureName = title + '.png'
                    picture = requests.get(first_https + src, proxies=proxies)
                    filepath_pictureName = 'D:\\PROJECT\\picture\\' + pictureName
                    if not os.path.isfile(filepath_pictureName):
                        with open(filepath_pictureName, 'wb') as f:
                            f.write(picture.content)
                        x += 1
                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + '    wrote image #{} successfully'.format(x), 'image: ' + title)
                    else:
                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + '  image already exists, skipping', 'image: ' + title)

            # collect the article's paragraphs and write them to a text file
            p_list = div.find_all(name='p')
            p_lists.append(p_list)
            filepath_name = 'D:\\PROJECT\\lunwen\\' + title + '.txt'
            if not os.path.isfile(filepath_name):
                with open(filepath_name, 'a+', encoding="utf-8") as f1:
                    f1.write(title + '\n')  # write the title once, then every paragraph
                    for p in p_list:
                        f1.write(p.text + '\n')
                y += 1
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + '    wrote file #{} successfully'.format(y), 'file: ' + title)
            else:
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + title + ' file already exists, skipping')
    return p_lists, x, y

def fun(p_lists):
    # Collect any links that appear inside the article paragraphs.
    href_list = []
    for p_list in p_lists:
        for p in p_list:
            for a in p.find_all(name='a'):
                href = a.get('href')
                if not href:  # the original compared the tag itself to "", which never matched
                    continue
                href_list.append({a.text: href})
    return href_list

def ip():
    # Scrape the free-proxy list and return one random proxy dict per call.
    url = 'http://www.xicidaili.com/nn/'
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)
    return proxies

if __name__=='__main__':
    # header={
    #         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    #         'Accept-Encoding': 'gzip, deflate, br',
    #         'Accept-Language':'zh-CN,zh;q=0.9',
    #         'Connection': 'close',
    #         'Cookie': 'BAIDUID=919FB8CEF5692A814DD7436D01B8E0FE:FG=1; BIDUPSID=919FB8CEF5692A814DD7436D01B8E0FE; PSTM=1554559379; MCITY=-340%3A; __cfduid=d14f04dc95d4c7db585310c0e1f07ab331568031553; BDUSS=zFXOGd1VlMtelZ1UkxXd29TYWJQaX5-OXBGY1M1VDhPa3FDa0ZCWGhpLTlVNTlkSVFBQUFBJCQAAAAAAAAAAAEAAACgA3uvbHZqdW5ibzEzNDA0NwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL3Gd129xnddM; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=tjKsJeCCxG3jBe6wAp1ZHCXuPodtJnXXd9GB3J; H_BDCLCKID_SF=tRk8oK-atDvbfP0k54cHh-7H-UnLqb3BW57Z0lOnMp05jloNjRJNK5_ly-bv-lOy5TnZWfn95ITnECO_e4bK-TrXjG7P; H_PS_PSSID=1435_21080_29523_29721_29567_29221_22160; delPer=0; PSINO=6; locale=zh',
    #         'Referer': 'https://pos.baidu.com/wh/o.htm?ltr=',
    #         'Upgrade-Insecure-Requests': '1',
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    # }

    lunwen_list = []  # holds the URL of every article found
    first_https = "https://www.lunwendata.com"
    L = ClassificationPapers("https://www.lunwendata.com/thesis/List_6.html")
    lunwen_list = Papers(L)
    p_lists, x, y = AnalyticalPapers(lunwen_list)
    href_list = fun(p_lists)
    # Follow the links found inside the articles and repeat the whole pipeline on them.
    for href in href_list:
        for k, v in href.items():
            L = ClassificationPapers(v)
            lunwen_list = Papers(L)
            p_lists, x, y = AnalyticalPapers(lunwen_list, x, y)
    print(p_lists)
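
One practical weakness worth noting: ip() re-scrapes the entire proxy list for every single request, and a randomly chosen free proxy often fails outright. A hedged sketch of an alternative helper that caches the scraped list and retries with a fresh proxy on failure is below; the name safe_get, the retry count, and the timeout are assumptions for illustration, not part of the original script.

# Hypothetical helper (not in the original script): cache the scraped proxy
# list once and retry a request with a different random proxy on failure.
import requests
from 项目.ip代理 import get_ip_list, get_random_ip

_cached_ip_list = None

def safe_get(url, retries=3, timeout=10):
    global _cached_ip_list
    if _cached_ip_list is None:
        headers = {'User-Agent': 'Mozilla/5.0'}
        _cached_ip_list = get_ip_list('http://www.xicidaili.com/nn/', headers=headers)
    for _ in range(retries):
        proxies = get_random_ip(_cached_ip_list)
        try:
            return requests.get(url, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            continue  # dead proxy, try another one
    return requests.get(url, timeout=timeout)  # last resort: direct request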



