import re
import random

import requests
from bs4 import BeautifulSoup
# html = urlopen("https://opensource.org/licenses/alphabetical").read().decode('utf-8')
# Link to the deprecated/retired licenses page
# https://opensource.org/licenses/do-not-use
# res = re.findall(r"<h1 class=\"page-title\"> (.+?)</h1>",html)
# soup = BeautifulSoup(html,features='lxml')
# print(soup.h1)
# print("\n",res[0])
# all_href = soup.find_all('li',{'class':'field-item even'})
# for l in all_href:
# print(l.get_text())
# jan = soup.find('ul',{'class':'xxxx'})
# d_jan = jan.find_all('li')
# for d in d_jan:
# print(d.get_text())
# Scrape the content of a single license
# all_href = soup.find_all('p')
# for l in all_href:
# print(l.get_text())
# all_href1 = soup.find_all('pre')
# if all_href1!=None:
# for l1 in all_href1:
# print(l1.get_text())
# course_links = soup.find_all('a',{'href': re.compile('/licenses/*')})
# for link in course_links:
# if link['href'].startswith('/licenses/'):
# print("https://opensource.org" + link['href'])
# Method 2: POST and GET with requests
# import requests
# # import webbrowser
# param ={"ws":"sdlkjf"}
# r = requests.get("http://www.baidu.com/s",params=param)
# print(r.url)
# webbrowser.open(r.url)
# import scrapy
# Collect the detail-page links for all licenses
def get_mulu_detail(urls):
    """Fetch a license index page and return absolute links to each entry.

    Parameters:
        urls: URL of the index page to fetch (a single URL string,
              despite the plural name — kept for backward compatibility).

    Returns:
        list[str]: absolute ``https://opensource.org/...`` URLs, one per
        ``<li>`` found inside the ``div.field-item.even`` container.

    Raises:
        requests.RequestException: on network failure or timeout.
        AttributeError: if the expected container div is missing.
    """
    # timeout guards against requests.get hanging forever on a dead server
    html = requests.get(urls, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    book_mulu = soup.find('div', class_='field-item even')
    li_list = book_mulu.find_all('li')
    # Each <li> holds a relative href; prefix the site root to make it absolute.
    return ['https://opensource.org' + li.find('a')['href'] for li in li_list]
# Fetch the text content of each license page
def get_content(urls):
    """Download each license page and return its cleaned body text.

    Parameters:
        urls: iterable of absolute license-page URLs.

    Returns:
        list[str]: one cleaned text string per URL, in input order
        (newlines and non-breaking spaces removed).

    Raises:
        requests.RequestException: on network failure or timeout.
        AttributeError: if a page lacks the expected container div.
    """
    content = []
    for url in urls:
        # timeout guards against requests.get hanging forever on a dead server
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', class_='field-item even')
        # Flatten the extracted text: drop newlines and &nbsp; characters.
        text = container.text.replace('\n', '').replace('\xa0', '')
        content.append(text)
    return content
def save_license(contents, license_name):
    """Append every chunk in *contents* to ``<license_name>.txt`` (UTF-8).

    Parameters:
        contents: iterable of text chunks to write, in order.
        license_name: base name of the output file (extension added here).

    The file is opened in append mode, matching the original behavior of
    accumulating across repeated calls.
    """
    filename = license_name + '.txt'
    # Open the file once and write all chunks, instead of the original
    # pattern of reopening it for every chunk inside the loop.
    with open(filename, 'a', encoding='utf-8') as f:
        f.writelines(contents)
def main():
    """Entry point: run the license-scraping pipeline."""
    # NOTE(review): get_license() is not defined anywhere in this file —
    # as written this raises NameError at runtime. Presumably it returns a
    # list of dicts with 'license_href'/'license_name' keys, as sketched in
    # the commented-out pipeline below — TODO confirm where it is defined.
    get_license()
    # datas = get_license()
    # for data in datas:
    # license_href = data['license_href']
    # license_name = data['license_name']
    # print(license_name + ":" + license_href)
    # sleep(2)
    # detail_href = get_mulu_detail(license_href)
    # sleep(2)
    # contents = get_content(detail_href)
    # # print(license_name + ":\n" + contents)
    # sleep(2)
    # save_license(contents, license_name)
if __name__ == '__main__':
    main()