A Python Crawler for Open Source Licenses

This post walks through an open source license crawler built with Python's requests and BeautifulSoup libraries. The project crawls opensource.org and collects the details of each open source license, including its name and full text. By parsing the HTML with regular expressions and BeautifulSoup, the crawler extracts the required data and saves it to text files.
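Before walking through the full script, here is a minimal sketch of the fetch-and-parse pattern it relies on. The index URL is real; the assumption that the page title sits in an <h1> tag reflects the site layout at the time of writing:

import requests
from bs4 import BeautifulSoup

# Fetch the alphabetical license index and parse it with lxml.
html = requests.get("https://opensource.org/licenses/alphabetical").text
soup = BeautifulSoup(html, "lxml")
print(soup.h1.get_text(strip=True))  # the page title, e.g. the index heading
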
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

# from urllib.request import urlopen
# html = urlopen("https://opensource.org/licenses/alphabetical").read().decode('utf-8')
# Link to the list of retired licenses:
# https://opensource.org/licenses/do-not-use

# Extract the page title two ways: with a regex, and with BeautifulSoup
# res = re.findall(r"<h1 class=\"page-title\"> (.+?)</h1>", html)
# soup = BeautifulSoup(html, features='lxml')
# print(soup.h1)
# print("\n", res[0])

# List every license entry on the index page
# all_href = soup.find_all('li', {'class': 'field-item even'})
# for l in all_href:
#     print(l.get_text())

# Grab one <ul> by class and print its <li> items ('xxxx' is a placeholder)
# jan = soup.find('ul', {'class': 'xxxx'})
# d_jan = jan.find_all('li')
# for d in d_jan:
#     print(d.get_text())
# Scrape the text of a single license page
# all_href = soup.find_all('p')
# for l in all_href:
#     print(l.get_text())
# all_href1 = soup.find_all('pre')
# if all_href1:  # find_all returns a (possibly empty) list, never None
#     for l1 in all_href1:
#         print(l1.get_text())

# Collect every link whose href starts with /licenses/
# course_links = soup.find_all('a', {'href': re.compile('^/licenses/')})
# for link in course_links:
#     if link['href'].startswith('/licenses/'):
#         print("https://opensource.org" + link['href'])

# Approach 2: using requests for GET and POST
# import requests
# import webbrowser
# param = {"wd": "sdlkjf"}  # 'wd' is Baidu's search-query parameter
# r = requests.get("http://www.baidu.com/s", params=param)
# print(r.url)
# webbrowser.open(r.url)
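
# The comment above mentions both GET and POST, but only GET is shown.
# A hedged sketch of the POST side (httpbin.org is a public echo service,
# used purely for illustration; it is not part of the original post):
# data = {"firstname": "hello", "lastname": "world"}
# r = requests.post("http://httpbin.org/post", data=data)
# print(r.json()["form"])  # echoes back {'firstname': 'hello', 'lastname': 'world'}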

# import scrapy

# Get the detail links for every license listed on an index page
def get_mulu_detail(urls):
    html = requests.get(urls).text
    soup = BeautifulSoup(html, 'lxml')
    # The index sits in a <div class="field-item even">, one <li> per license
    book_mulu = soup.find('div', class_='field-item even')
    li_list = book_mulu.find_all('li')
    detail_list = []
    for li in li_list:
        li_href = 'https://opensource.org' + li.find('a')['href']
        detail_list.append(li_href)
    return detail_list
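
# Usage sketch, left commented so the module-level flow is unchanged
# (the index URL is real; the layout assumed above may have changed):
# links = get_mulu_detail('https://opensource.org/licenses/alphabetical')
# print(len(links), links[:3])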

# Get the text of each license page
def get_content(urls):
    content = []
    for url in urls:
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'lxml')
        # The license text lives in the same 'field-item even' container
        container = soup.find('div', class_='field-item even')
        container = container.text.replace('\n', '').replace('\xa0', '')
        content.append(container)
    return content


# Append the scraped blocks to <license_name>.txt
def save_license(contents, license_name):
    filename = license_name + '.txt'
    # Open the file once instead of reopening it for every block
    with open(filename, 'a', encoding='utf-8') as f:
        for content in contents:
            f.write(content)
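

# The original post calls get_license() in main() but never defines it.
# Based on how the commented-out body of main() used its return value,
# here is a minimal sketch; the '^/licenses/' selector and the returned
# dict keys are assumptions about the page, not from the original post.
def get_license():
    html = requests.get('https://opensource.org/licenses/alphabetical').text
    soup = BeautifulSoup(html, 'lxml')
    datas = []
    for a in soup.find_all('a', href=re.compile('^/licenses/')):
        datas.append({
            'license_name': a.get_text(strip=True),
            'license_href': 'https://opensource.org' + a['href'],
        })
    return datas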



def main():
    datas = get_license()
    for data in datas:
        license_href = data['license_href']
        license_name = data['license_name']
        print(license_name + ":" + license_href)
        sleep(2)  # pause between requests to be polite to the server
        detail_href = get_mulu_detail(license_href)
        sleep(2)
        contents = get_content(detail_href)
        sleep(2)
        save_license(contents, license_name)


if __name__ == '__main__':
    main()
