A Python Crawler for Open Source Licenses

This post walks through an open source license crawler built with Python's requests and BeautifulSoup libraries. The project crawls opensource.org and collects the details of each open source license, including its name and full text. By parsing the HTML with regular expressions and BeautifulSoup, the crawler extracts the required data and saves it to text files.
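Before walking through the full script, here is a minimal sketch of the fetch-and-parse pattern it relies on. The index URL is real; the assumption that the page title sits in an <h1> tag reflects the site layout at the time of writing:

import requests
from bs4 import BeautifulSoup

# Fetch the alphabetical license index and parse it with lxml.
html = requests.get("https://opensource.org/licenses/alphabetical").text
soup = BeautifulSoup(html, "lxml")
print(soup.h1.get_text(strip=True))  # the page title, e.g. the index heading
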
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

# from urllib.request import urlopen
# html = urlopen("https://opensource.org/licenses/alphabetical").read().decode('utf-8')
# Link to the list of retired licenses:
# https://opensource.org/licenses/do-not-use

# Extract the page title two ways: with a regex, and with BeautifulSoup
# res = re.findall(r"<h1 class=\"page-title\"> (.+?)</h1>", html)
# soup = BeautifulSoup(html, features='lxml')
# print(soup.h1)
# print("\n", res[0])

# List every license entry on the index page
# all_href = soup.find_all('li', {'class': 'field-item even'})
# for l in all_href:
#     print(l.get_text())

# Grab one <ul> by class and print its <li> items ('xxxx' is a placeholder)
# jan = soup.find('ul', {'class': 'xxxx'})
# d_jan = jan.find_all('li')
# for d in d_jan:
#     print(d.get_text())
# Scrape the text of a single license page
# all_href = soup.find_all('p')
# for l in all_href:
#     print(l.get_text())
# all_href1 = soup.find_all('pre')
# if all_href1:  # find_all returns a (possibly empty) list, never None
#     for l1 in all_href1:
#         print(l1.get_text())

# Collect every link whose href starts with /licenses/
# course_links = soup.find_all('a', {'href': re.compile('^/licenses/')})
# for link in course_links:
#     if link['href'].startswith('/licenses/'):
#         print("https://opensource.org" + link['href'])

# Approach 2: using requests for GET and POST
# import requests
# import webbrowser
# param = {"wd": "sdlkjf"}  # 'wd' is Baidu's search-query parameter
# r = requests.get("http://www.baidu.com/s", params=param)
# print(r.url)
# webbrowser.open(r.url)
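
# The comment above mentions both GET and POST, but only GET is shown.
# A hedged sketch of the POST side (httpbin.org is a public echo service,
# used purely for illustration; it is not part of the original post):
# data = {"firstname": "hello", "lastname": "world"}
# r = requests.post("http://httpbin.org/post", data=data)
# print(r.json()["form"])  # echoes back {'firstname': 'hello', 'lastname': 'world'}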

# import scrapy

# Get the detail links for every license listed on an index page
def get_mulu_detail(urls):
    html = requests.get(urls).text
    soup = BeautifulSoup(html, 'lxml')
    # The index sits in a <div class="field-item even">, one <li> per license
    book_mulu = soup.find('div', class_='field-item even')
    li_list = book_mulu.find_all('li')
    detail_list = []
    for li in li_list:
        li_href = 'https://opensource.org' + li.find('a')['href']
        detail_list.append(li_href)
    return detail_list
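
# Usage sketch, left commented so the module-level flow is unchanged
# (the index URL is real; the layout assumed above may have changed):
# links = get_mulu_detail('https://opensource.org/licenses/alphabetical')
# print(len(links), links[:3])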

# Get the text of each license page
def get_content(urls):
    content = []
    for url in urls:
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'lxml')
        # The license text lives in the same 'field-item even' container
        container = soup.find('div', class_='field-item even')
        container = container.text.replace('\n', '').replace('\xa0', '')
        content.append(container)
    return content


# Append the scraped blocks to <license_name>.txt
def save_license(contents, license_name):
    filename = license_name + '.txt'
    # Open the file once instead of reopening it for every block
    with open(filename, 'a', encoding='utf-8') as f:
        for content in contents:
            f.write(content)
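

# The original post calls get_license() in main() but never defines it.
# Based on how the commented-out body of main() used its return value,
# here is a minimal sketch; the '^/licenses/' selector and the returned
# dict keys are assumptions about the page, not from the original post.
def get_license():
    html = requests.get('https://opensource.org/licenses/alphabetical').text
    soup = BeautifulSoup(html, 'lxml')
    datas = []
    for a in soup.find_all('a', href=re.compile('^/licenses/')):
        datas.append({
            'license_name': a.get_text(strip=True),
            'license_href': 'https://opensource.org' + a['href'],
        })
    return datas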



def main():
    datas = get_license()
    for data in datas:
        license_href = data['license_href']
        license_name = data['license_name']
        print(license_name + ":" + license_href)
        sleep(2)  # pause between requests to be polite to the server
        detail_href = get_mulu_detail(license_href)
        sleep(2)
        contents = get_content(detail_href)
        sleep(2)
        save_license(contents, license_name)


if __name__ == '__main__':
    main()
