from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request
import urllib.parse
class Spider(object):
    """Crawl the campus-news list pages and collect article URLs."""

    def main(self):
        """Fetch list pages 1 through 10 and extract article links.

        Returns:
            list[str]: absolute article URLs gathered from all pages.
        """
        urls = []
        # Original comment says pages 1 to 10, but range(1, 10) stops at
        # page 9 — use range(1, 11) to actually include page 10.
        for i in range(1, 11):
            url = 'https://www.jhc.cn/4548/list' + str(i) + '.htm'
            request = urllib.request.Request(url)
            # 'with' ensures the HTTP response is closed even if reading fails
            # (the original leaked the connection).
            with urllib.request.urlopen(request) as response:
                html = response.read().decode('utf-8')
            urls.extend(self.load_page(html, base_url=url))
        return urls

    def load_page(self, html, base_url='https://www.jhc.cn/'):
        """Extract article URLs from one list page.

        Parameters:
            html: page source as a unicode string.
            base_url: base used to resolve relative hrefs with urljoin;
                defaults to the site root (new optional parameter, so the
                original one-argument call still works).

        Returns:
            list[str]: absolute URLs whose href starts with '/2020/'.
        """
        soup = BeautifulSoup(html, 'lxml')
        # Fix: the original appended to an undefined global 'uls' and the
        # statement itself was truncated — build and return a local list.
        uls = []
        # '.Article_Title' selects elements carrying class Article_Title.
        for div in soup.select('.Article_Title'):
            anchors = div.select('a')
            if not anchors:
                # Guard: a title block without a link would have raised
                # IndexError in the original.
                continue
            href = anchors[0].get('href')
            # Keep only 2020 article links, resolved to absolute URLs.
            if href and href.startswith('/2020/'):
                uls.append(urljoin(base_url, href))
        return uls
# Source article: "python爬取校园新闻" (Python crawler for campus news).
# Scraped from CSDN; latest recommended revision published 2024-05-30 10:08:42.