import json
import re
from urllib.parse import urljoin

import requests
from lxml import etree
def getTreeData(url):
    """Fetch a news list page and print the time, title and link of each entry.

    Entries are the ``<li>`` elements that carry an ``id`` attribute under
    ``<div class="Newslist">``.  For each one the first ``<span>`` text, the
    first ``<a>`` text and the first ``<a href>`` are printed to stdout.
    Nothing is returned.

    Args:
        url: Address of the list page.  It is also used as the base when
            resolving each entry's href into an absolute link.

    Raises:
        requests.RequestException: If the request fails or the server does
            not answer within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.88 Safari/537.36'}
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode("utf-8", "ignore")
    tree = etree.HTML(html)
    if tree is None:
        # Nothing parsable came back, so there is nothing to print.
        return

    # Walk the <li> elements themselves instead of collecting their ids and
    # re-querying the whole document once per field.
    for item in tree.xpath('//div[@class="Newslist"]/ul/li[@id]'):
        times = item.xpath('./span/text()')
        titles = item.xpath('./a/text()')
        hrefs = item.xpath('./a/@href')
        if not (times and titles and hrefs):
            # Skip an incomplete entry rather than aborting the crawl.
            continue
        print("新闻的时间:" + str(times[0]))
        print("新闻的标题:" + str(titles[0]))
        # urljoin copes with root-relative, page-relative ("../info/...")
        # and already-absolute hrefs; plain concatenation with the host
        # only worked for hrefs that start with "/".
        print("新闻的连接:" + urljoin(url, hrefs[0]))
def getImageData(url):
    """Fetch a picture-news list page and print the title and link of each entry.

    Entries are the ``<li>`` elements that carry an ``id`` attribute under
    ``<div class="Pic1">``.  For each one the first ``<a>`` text and the
    first ``<a href>`` are printed to stdout.  Nothing is returned.

    Args:
        url: Address of the list page.  It is also used as the base when
            resolving each entry's href into an absolute link.

    Raises:
        requests.RequestException: If the request fails or the server does
            not answer within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.88 Safari/537.36'}
    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode("utf-8", "ignore")
    tree = etree.HTML(html)
    if tree is None:
        # Nothing parsable came back, so there is nothing to print.
        return

    # Walk the <li> elements themselves instead of collecting their ids and
    # re-querying the whole document once per field.
    for item in tree.xpath('//div[@class="Pic1"]/ul/li[@id]'):
        titles = item.xpath('./a/text()')
        hrefs = item.xpath('./a/@href')
        if not (titles and hrefs):
            # Skip an incomplete entry rather than aborting the crawl.
            continue
        print("新闻的标题:" + str(titles[0]))
        # urljoin copes with root-relative, page-relative ("../info/...")
        # and already-absolute hrefs; plain concatenation with the host
        # only worked for hrefs that start with "/".
        print("新闻的连接:" + urljoin(url, hrefs[0]))
def getPage(url):
    """Return the number read from a list page's pagination bar, as a string.

    The value is taken from the second ``span.p_t`` text node inside the
    ``pb_sys_common pb_sys_normal pb_sys_style1`` container.  Callers convert
    the result with ``int()``.

    Args:
        url: Address of the list page whose pagination bar is inspected.

    Returns:
        The extracted digits as a string (may be empty, see the note in the
        body).

    Raises:
        requests.RequestException: If the request fails or the server does
            not answer within the timeout.
        IndexError: If the page does not contain the expected pagination
            label.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.88 Safari/537.36'}
    # A timeout keeps an unresponsive server from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.content.decode("utf-8", "ignore")
    tree = etree.HTML(html)
    labels = [] if tree is None else tree.xpath(
        '//div[@class="pb_sys_common pb_sys_normal pb_sys_style1"]'
        '//span[@class="p_t"]/text()')
    if len(labels) < 2:
        # Same exception type as the bare index lookup, but with context.
        raise IndexError("pagination label not found on " + url)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    # NOTE(review): r'\d*' also matches the empty string, so the second match
    # holds the digits that start at index 1 only when the label begins with
    # a single non-digit character; otherwise it is ''.  Confirm against the
    # live markup before relying on other label formats.
    matches = re.findall(r'\d*', labels[1])
    return matches[1]
def schoolNews():
    """Crawl every page of the "xyxw" news list and print its entries.

    The first page is ``xyxw.htm``; the remaining pages are requested as
    ``xyxw/<j>.htm`` with ``j`` counting down from ``page - 1``, where
    ``page`` is the value reported by ``getPage``.  Output goes to stdout
    via ``getTreeData``.
    """
    url = "http://news.pdsu.edu.cn/xyxw.htm"
    page = getPage(url)
    print("第----------1----------页")
    getTreeData(url)
    # Stop at 0 (exclusive) so that 1.htm is requested as well.  With a stop
    # of 1 only page - 1 pages were crawled although getPage reported page.
    # NOTE(review): assumes the final list page is served as 1.htm - confirm.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        thisurl = 'http://news.pdsu.edu.cn/xyxw/' + str(j) + '.htm'
        getTreeData(thisurl)
def baseDynamic():
    """Crawl every page of the "jcdt" news list and print its entries.

    The first page is ``jcdt.htm``; the remaining pages are requested as
    ``jcdt/<j>.htm`` with ``j`` counting down from ``page - 1``, where
    ``page`` is the value reported by ``getPage``.  Output goes to stdout
    via ``getTreeData``.
    """
    url = "http://news.pdsu.edu.cn/jcdt.htm"
    page = getPage(url)
    print("第----------1----------页")
    getTreeData(url)
    # Stop at 0 (exclusive) so that 1.htm is requested as well.  With a stop
    # of 1 only page - 1 pages were crawled although getPage reported page.
    # NOTE(review): assumes the final list page is served as 1.htm - confirm.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        thisurl = 'http://news.pdsu.edu.cn/jcdt/' + str(j) + '.htm'
        getTreeData(thisurl)
def newsImage():
    """Crawl every page of the "tpxw" picture-news list and print its entries.

    The first page is ``tpxw.htm``; the remaining pages are requested as
    ``tpxw/<j>.htm`` with ``j`` counting down from ``page - 1``, where
    ``page`` is the value reported by ``getPage``.  Every page is parsed
    with ``getImageData``; output goes to stdout.
    """
    url = "http://news.pdsu.edu.cn/tpxw.htm"
    page = getPage(url)
    print("第----------1----------页")
    # Parse the first page with the same parser as the following pages;
    # it was previously read with getTreeData, which looks for the
    # "Newslist" container instead of the "Pic1" one.
    getImageData(url)
    # Stop at 0 (exclusive) so that 1.htm is requested as well.  With a stop
    # of 1 only page - 1 pages were crawled although getPage reported page.
    # NOTE(review): assumes the final list page is served as 1.htm - confirm.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        thisurl = 'http://news.pdsu.edu.cn/tpxw/' + str(j) + '.htm'
        getImageData(thisurl)
if __name__ == '__main__':
    # Run the three section crawlers one after another, in a fixed order.
    for crawl_section in (schoolNews, baseDynamic, newsImage):
        crawl_section()
# Crawl results (爬取的结果)