"""
"""
import urllib.request
from bs4 import BeautifulSoup
'''
Purpose:
Crawl the given web page and collect the URLs of all news articles found on it.
'''
#############################################
## 直接操作
# ## 定义需要爬取的网站URL
# url = "https://www.huxiu.com"
#
# ## 获取虎嗅网站首页的内容
# index_page = urllib.request.urlopen(url).read().decode("utf-8")
#
# ## 解析内容
# # 创建一个Beautiful对象,用来格式化处理内容
# soup= BeautifulSoup(index_page, features="html.parser")
# # 过滤 所有文章url (文章的url都是a标签)
# a_list= soup.select("a[href]")
# # 定义一个 空set集合, 用来存放首页中所有的url
# url_set = set()
#
# # 获取 a标签中,href属性的值 (for 循环遍历 a标签数组集合,筛选出自己所需的信息)
# for a_tag in a_list:
# #获取 标签中属性值方法: 标签["属性名称"]
# href = a_tag["href"]
# if "article" in href and "http" not in href and "/1.html" not in href:
# # if "article" in href :
# # 由于解析出的url,有重复元素,所以需要用set集合特性,进行去重操作;
# url_set.add(href)
# print(url_set)
#############################################
#############################################
## 封装为方法操作
def parse_index_page(url):
    """Collect the article links found on an index page.

    Downloads ``url``, parses the HTML with BeautifulSoup and gathers the
    ``href`` value of every ``<a>`` tag whose ``href`` contains the
    substring ``"article"``.

    Args:
        url: Address of the page to crawl.

    Returns:
        A set of href strings exactly as they appear in the page (they are
        not resolved against ``url``). A set is used because the same link
        can appear several times in the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even when reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        index_page = response.read().decode("utf-8")

    # Build a BeautifulSoup tree for querying the page content.
    soup = BeautifulSoup(index_page, features="html.parser")

    # "a[href]" only matches anchors that carry an href attribute, so the
    # subscript access below cannot fail on a missing attribute.
    return {
        a_tag["href"]
        for a_tag in soup.select("a[href]")
        if "article" in a_tag["href"]
    }
## Call the function defined above
## URL of the site to crawl
url = "https://www.huxiu.com"
# Fetch the page and collect the set of article hrefs it contains.
set_out = parse_index_page(url)
print(set_out)
############################
## Printed result (sample output from one run)
## {'/article/428399.html', '/article/428437.html', '/article/428313.html', '/article/428377.html', '/article/428223.html', '/article/428429.html', '/article/427018.html', '/article/428035.html', '/article/427798.html', '/article/428246.html', '/article/428422.html', '/article/428292.html', '/article/428365.html', '/article/427933.html', '/article/428252.html', '/article/427936.html', '/article/428104.html', '/article/427495.html', '/article/428425.html', '/article/428434.html', '/article/428295.html', '/article/428289.html', '/article/428346.html', '/article/428413.html', '/article/428250.html', '/article/428332.html', '/article/428358.html', '/article/428301.html', '/article/428448.html', '/article/428218.html', '/article/428202.html', '/article/428245.html', '/article/', '/article/428293.html', '/article/428369.html', '/article/428201.html'}