Crawling the Tsinghua PyPI Mirror

A crawler that downloads the Tsinghua PyPI mirror

Downloading the package index

The code in this article is for academic discussion only.

import os  # directory handling
import sys
import time
from urllib.parse import urljoin  # resolve relative package links

import bs4
import requests
import tqdm
from bs4 import BeautifulSoup  # HTML parsing
from retry import retry



@retry(requests.exceptions.ConnectionError, delay=2)
def download_pypi_resource():
    # Fetch the simple index and write every package name into a single file
    text = requests.get("https://pypi.tuna.tsinghua.edu.cn/simple").text
    bs = bs4.BeautifulSoup(text, "lxml")
    context = bs.find_all("a")
    path = "C:\\Users\\82172\\PycharmProjects\\flaskTest\\common_test\\pypi.txt"
    with open(path, "w") as f:
        for i in tqdm.tqdm(range(len(context)), total=len(context), desc="progress"):
            f.write(str(context[i].text) + "\n")


def batch_download_pypi_resource():
    # Write the package names into files of 30,000 entries each,
    # so that several crawler processes can work on them in parallel
    num = 30000
    name = 1
    text = requests.get("https://pypi.tuna.tsinghua.edu.cn/simple").text
    bs = bs4.BeautifulSoup(text, "lxml")
    context = bs.find_all("a")
    path = "C:\\dir_name\\"
    resource_list = []
    count = 0
    for i in tqdm.tqdm(range(len(context)), total=len(context), desc="progress"):
        resource_list.append(str(context[i].text))
        count += 1
        if count % num == 0:
            file_name = "{}{}".format(str(name), ".txt")
            name += 1
            with open("{}{}".format(path, file_name), "w") as f:
                for item in resource_list:
                    f.write(item + "\n")
            print("\n")
            print("File {} contains {} package names".format(file_name, len(resource_list)))
            resource_list = []
    # Write the remaining package names
    file_name = "{}{}".format(str(name), ".txt")
    with open("{}{}".format(path, file_name), "w") as f:
        for item in resource_list:
            f.write(item + "\n")
    print("\n")
    print("File {} contains {} package names".format(file_name, len(resource_list)))


# batch_download_pypi_resource()
# download_pypi_resource()

The crawler script

Here download_pypi_resource() crawls the full index in one pass, while batch_download_pypi_resource() splits the same index into files of 30,000 package names each, so that several crawler processes can later be run against different splits.
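
A minimal launch sketch for that parallel setup, assuming the crawler script below is saved as crawl_pypi.py (an illustrative name) and takes the path of its split file as its first command-line argument; the split file paths are also illustrative:

import subprocess
import sys

# Split files produced by batch_download_pypi_resource(); paths are illustrative
split_files = ["C:\\dir_name\\1.txt", "C:\\dir_name\\2.txt", "C:\\dir_name\\3.txt"]

# Start one crawler process per split file
procs = [subprocess.Popen([sys.executable, "crawl_pypi.py", split]) for split in split_files]

# Wait for every crawler to finish
for p in procs:
    p.wait()

Each process then works through its own split independently.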

def batchDownload(url_name, directory):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    html = requests.get(url_name, headers=headers, timeout=20).text
    bsObj = BeautifulSoup(html, 'html.parser')
    t1 = bsObj.find_all('a')
    url_list = []
    for t2 in t1:
        t3 = t2.get('href')
        url_list.append(t3)
    # Use set() to drop duplicate links
    hset = set(url_list)

    # Download every file linked from the package page
    for href in hset:
        # Resolve the (usually relative) href against the package page URL
        link = urljoin(url_name, href)
        name = href.split("/")[-1].split("#")[0]
        filename = os.path.join(directory, name)

        try:
            req = requests.get(link, headers=headers, timeout=20)
            with open(filename, "wb") as f:
                f.write(req.content)
            print("Downloaded", filename)
        except Exception:
            # Log failed downloads so they can be retried later
            with open("/data/logs.txt", "a") as f:
                f.write(name + "\n")

        # Pause between requests so the mirror does not treat the traffic as an attack
        time.sleep(1)


origin_url = 'https://pypi.tuna.tsinghua.edu.cn/simple/'
# The split file to work on; allow overriding it on the command line so that
# several copies of this script can each process a different split
origin_path = sys.argv[1] if len(sys.argv) > 1 else "./dir_name/2.txt"


# Read the package names that still need to be downloaded
data_list = []
with open(origin_path, 'r') as f:
    for line in f:
        data_list.append(line.strip('\n'))

try:
    for dir_name in list(data_list):
        url = "{}{}{}".format(origin_url, dir_name, '/')
        print(dir_name)
        file_path = './pypi_data1'
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        batchDownload(url, file_path)
        # Drop finished packages so an interrupted run can resume where it stopped
        if dir_name in data_list:
            data_list.remove(dir_name)
except Exception as e:
    print("*" * 10)
    print(e)
    print("*" * 10)
finally:
    # Write back the packages that have not been downloaded yet
    with open(origin_path, 'w') as f:
        for line in data_list:
            f.write(line + "\n")

Finally, combining this with the watchdog program from my post on monitoring a crawler and restarting it automatically, the crawl of the PyPI mirror can run to completion.
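
A minimal sketch of such a watchdog, assuming the crawler script is saved as crawl_pypi.py and takes its split file as the first argument (both names are assumptions, not the implementation from that post): it simply reruns the crawler until the split file is empty.

import os
import subprocess
import sys
import time

SPLIT_FILE = "./dir_name/2.txt"  # illustrative split file path

while True:
    # The crawler rewrites the split file with the packages still pending,
    # so an empty file means everything has been downloaded
    if os.path.getsize(SPLIT_FILE) == 0:
        print("All packages in", SPLIT_FILE, "have been downloaded")
        break
    # Run the crawler; if it crashes, the loop restarts it after a short pause
    subprocess.run([sys.executable, "crawl_pypi.py", SPLIT_FILE])
    time.sleep(5)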
