交院 (Zhejiang Institute of Communications) news crawler, part 1

A small scraper for the news list at http://www.zjvtit.edu.cn/xwzx/jyxw.htm: it reads the total page count from the pager, walks the numbered list pages plus the front page, and prints each article's title, publish time and body text.

from bs4 import BeautifulSoup
import urllib.request
import requests
from lxml import etree

# def load_page(url):
#     headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
#     request=urllib.request.Request(url,headers=headers)
#     return urllib.request.urlopen(request).read().decode('utf-8')
def write_pagew(file1):
    # Overwrite pachong.txt with the given text.
    with open('pachong.txt', 'w', encoding='utf-8') as file:
        file.write(file1)

def write_pagea(file1):
    # Append the given text to pachong.txt.
    with open('pachong.txt', 'a', encoding='utf-8') as file:
        file.write(file1)
def headers_page(url):
    # Build a request with a browser User-Agent and open it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return request, response
def tran_url(url):
    # Normalize the relative hrefs scraped from a list page into absolute URLs.
    xiuzheng_url = []
    for xiuzheng in url:
        if xiuzheng.startswith('../..'):
            # str.lstrip() strips characters, not a prefix, so slice the prefix off instead.
            xiuzheng_url.append('http://www.zjvtit.edu.cn/' + xiuzheng[len('../../'):])
        elif xiuzheng.startswith('../'):
            xiuzheng_url.append('http://www.zjvtit.edu.cn/' + xiuzheng[len('../'):])
        else:
            # Already an absolute URL, keep it unchanged.
            xiuzheng_url.append(xiuzheng)
    print(xiuzheng_url)
    return xiuzheng_url
def huoquyeshu_page(url):
    # Read the pager text on the first list page and pull out the total page count.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye126908"]/text()')
    # The page count is sliced from the tail of the last pager text node.
    yeshu = result[-1][-4:-1]
    return yeshu
def neirong(url):
    # Fetch one article page and print its title, publish time and body text.
    if url == '':
        return
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    biaoti = soup.select('.zw-title')[0].text
    # zuozhe = soup.select('.zz')[0].text
    time = soup.select('.zw-other')[0].text
    result = {}

    # author = zuozhe.split()[0].lstrip('作者:')
    # resource = zuozhe.split()[1].lstrip('来源:')
    # Cut off everything from '浏' onwards, then drop the '发布时间:' prefix;
    # slicing is used because lstrip() strips characters, not a prefix.
    timesource = time.split('浏')[0].strip()
    if timesource.startswith('发布时间:'):
        timesource = timesource[len('发布时间:'):]

    result['title'] = biaoti
    # result['author'] = author
    # result['resource'] = resource
    result['timesource'] = timesource
    result['article'] = ' '.join([p.text.strip() for p in soup.select('.zw p')[:-1]])

    print(result)
def shouye_spider():
    # Scrape the 20 article links on the front list page and hand each to neirong().
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('The front page is ' + url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    for d in range(0, 20):
        result_xpath_biaoti = '//*[@id="line_u9_' + str(d) + '"]' + '/a/text()'  # title XPath (currently unused)
        result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]' + '/a/@href'
        result_pachong = s.xpath(result_xpath_url)
        result_pachong = tran_url(result_pachong)
        neirong("".join(result_pachong))
def fanye_page(yeshu, paquyeshu):
    # Walk the numbered list pages (everything after the front page) and scrape each article on them.
    for b in range(int(yeshu) - paquyeshu + 1, int(yeshu)):
        url = 'http://www.zjvtit.edu.cn/xwzx/jyxw/' + str(b) + '.htm'
        print('Page ' + str(int(yeshu) - b) + ' is ' + url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        for d in range(0, 20):
            result_xpath_biaoti = '//*[@id="line_u9_' + str(d) + '"]' + '/a/text()'  # title XPath (currently unused)
            result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]' + '/a/@href'
            result_pachong = s.xpath(result_xpath_url)
            xiuzheng_url = tran_url(result_pachong)
            neirong("".join(xiuzheng_url))

if __name__ == '__main__':
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('Crawling news from Zhejiang Institute of Communications (浙江交通职业技术学院)')
    paquyeshu = int(input('How many pages to crawl: '))
    # Scrape the numbered follow-up pages first, then the front page.
    fanye_page(huoquyeshu_page(url), paquyeshu)
    shouye_spider()
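
The '../' handling in tran_url can also be done with standard-library URL resolution. The snippet below is not part of the script above; it is a minimal sketch that resolves scraped hrefs against the list-page URL with urllib.parse.urljoin, and the example hrefs are made up purely to illustrate the shapes the crawler sees ('../../...' and '../...').

from urllib.parse import urljoin

def resolve_links(base_url, hrefs):
    # Resolve each (possibly relative) href against the page it was scraped from.
    return [urljoin(base_url, h) for h in hrefs]

# Hypothetical hrefs for illustration only.
print(resolve_links('http://www.zjvtit.edu.cn/xwzx/jyxw.htm',
                    ['../../info/1015/1234.htm', '../jyxw/120.htm']))

For the first href this prints 'http://www.zjvtit.edu.cn/info/1015/1234.htm', which is the same absolute URL the prefix stripping in tran_url produces, without hard-coding the site root.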