平院新闻爬取

from lxml import etree
import requests
import re
import json

def getTreeData(url):
    """Fetch one "Newslist"-style listing page and print each item's
    date, title and absolute link.

    url: absolute URL of a listing page on news.pdsu.edu.cn.
    Side effects: one HTTP GET, prints to stdout.  Returns None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    data = requests.get(url, headers=headers).content.decode("utf-8", "ignore")
    treeData = etree.HTML(data)
    # Iterate the <li> elements directly with relative XPath instead of
    # re-querying the whole document once per id (the original was
    # O(items x document) and shadowed the builtin `id`).
    for item in treeData.xpath('//div[@class="Newslist"]/ul/li[@id]'):
        time = item.xpath('./span/text()')
        title = item.xpath('./a/text()')
        link1 = item.xpath('./a/@href')
        # Skip malformed entries instead of raising IndexError.
        if not time or not title or not link1:
            continue
        print("新闻的时间:" + str(time[0]))
        print("新闻的标题:" + str(title[0]))
        print("新闻的连接:" + "http://news.pdsu.edu.cn" + link1[0])

def getImageData(url):
    """Fetch one "Pic1"-style (picture news) listing page and print each
    item's title and absolute link.

    url: absolute URL of a picture-news listing page on news.pdsu.edu.cn.
    Side effects: one HTTP GET, prints to stdout.  Returns None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    data = requests.get(url, headers=headers).content.decode("utf-8", "ignore")
    treeData = etree.HTML(data)
    # Relative XPath per <li> instead of one full-document query per id
    # (original re-scanned the tree for every item and shadowed builtin `id`).
    for item in treeData.xpath('//div[@class="Pic1"]/ul/li[@id]'):
        title = item.xpath('./a/text()')
        link1 = item.xpath('./a/@href')
        # Skip malformed entries instead of raising IndexError.
        if not title or not link1:
            continue
        print("新闻的标题:" + str(title[0]))
        print("新闻的连接:" + "http://news.pdsu.edu.cn" + link1[0])

def getPage(url):
    """Return the total page count (as a digit string) parsed from the
    pagination widget of a listing page.

    url: absolute URL of the section's first listing page.
    Returns: first run of digits found in the second span.p_t text node.
    NOTE(review): assumes that text node carries the page total with a
    non-digit prefix (e.g. "共N页") — confirm against the live markup.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    data = requests.get(url, headers=headers).content.decode("utf-8", "ignore")
    treeData = etree.HTML(data)
    page = treeData.xpath('//div[@class="pb_sys_common pb_sys_normal pb_sys_style1"]//span[@class="p_t"]/text()')
    # r'\d+' replaces the original '\d*': non-raw '\d' is an invalid escape
    # (SyntaxWarning on Python 3.12+), and '\d*' also produced empty-string
    # matches that made the hard-coded [1] index fragile.
    digits = re.findall(r'\d+', page[1])
    return digits[0]

def schoolNews():
    """Crawl every page of the 校园新闻 (campus news) section.

    Page 1 lives at xyxw.htm; archived pages are numbered total-1 down
    to 1 (xyxw/1.htm is the oldest).  Side effects only (HTTP + print).
    """
    url = "http://news.pdsu.edu.cn/xyxw.htm"
    page = getPage(url)
    print("第----------1----------页")
    getTreeData(url)
    # range(..., 0, -1) now includes j == 1; the original stopped at 2 and
    # skipped the oldest archived page.  NOTE(review): verify xyxw/1.htm
    # exists on the live site.  enumerate replaces the manual counter.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        getTreeData('http://news.pdsu.edu.cn/xyxw/' + str(j) + '.htm')

def baseDynamic():
    """Crawl every page of the 基层动态 (grassroots dynamics) section.

    Page 1 lives at jcdt.htm; archived pages are numbered total-1 down
    to 1 (jcdt/1.htm is the oldest).  Side effects only (HTTP + print).
    """
    url = "http://news.pdsu.edu.cn/jcdt.htm"
    page = getPage(url)
    print("第----------1----------页")
    getTreeData(url)
    # range(..., 0, -1) now includes j == 1; the original stopped at 2 and
    # skipped the oldest archived page.  NOTE(review): verify jcdt/1.htm
    # exists on the live site.  enumerate replaces the manual counter.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        getTreeData('http://news.pdsu.edu.cn/jcdt/' + str(j) + '.htm')

def newsImage():
    """Crawl every page of the 图片新闻 (picture news) section.

    Page 1 lives at tpxw.htm; archived pages are numbered total-1 down
    to 1 (tpxw/1.htm is the oldest).  Side effects only (HTTP + print).
    """
    url = "http://news.pdsu.edu.cn/tpxw.htm"
    page = getPage(url)
    print("第----------1----------页")
    # Bug fix: the original parsed page 1 with getTreeData, which looks for
    # div.Newslist — picture-news pages use div.Pic1, so page 1 printed
    # nothing.  Use getImageData like every other page of this section.
    getImageData(url)
    # range(..., 0, -1) now includes j == 1; the original stopped at 2 and
    # skipped the oldest archived page.  enumerate replaces the manual counter.
    for index, j in enumerate(range(int(page) - 1, 0, -1), start=2):
        print("第----------" + str(index) + "----------页")
        getImageData('http://news.pdsu.edu.cn/tpxw/' + str(j) + '.htm')

if __name__ == '__main__':
    # Crawl the three news sections in turn: campus news (校园新闻),
    # grassroots dynamics (基层动态) and picture news (图片新闻).
    # Each call prints results to stdout; nothing is returned or saved.
    schoolNews()
    baseDynamic()
    newsImage()

爬取的结果
(原文此处为爬取结果的截图,图片未随文本保留)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值