Zhejiang Institute of Communications (浙江交通职业技术学院) news-page crawler: scrape the news listings and write the articles to a file

from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from urllib.request import urlopen
import requests
from lxml import etree
import json

# def load_page(url):
#     headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
#     request=urllib.request.Request(url,headers=headers)
#     return urllib.request.urlopen(request).read().decode('utf-8')
def headers_page(url):
    # Build a Request with a browser User-Agent and open it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return request, response
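headers_page bundles the Request construction that the functions below repeat inline. A minimal usage sketch (the listing URL is the one the script uses in __main__):

# Usage sketch: fetch and decode one page through headers_page.
_, resp = headers_page('http://www.zjvtit.edu.cn/xwzx/jyxw.htm')
listing_html = resp.read().decode('utf-8')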
def tran_url(url):
    # Turn the relative hrefs scraped from a list page into absolute URLs.
    # Note that lstrip strips *characters* ('.' and '/'), not a literal
    # prefix, which is exactly what the '../' and '../../' links need here.
    xiuzheng_url = []
    for xiuzheng in url:
        if xiuzheng.startswith('../'):
            xiuzheng_url.append('http://www.zjvtit.edu.cn/' + xiuzheng.lstrip('./'))
        else:
            # Already absolute (or not a relative link): keep it unchanged.
            xiuzheng_url.append(xiuzheng)
    print(xiuzheng_url)
    return xiuzheng_url
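tran_url rebuilds the absolute URLs by hand. A hedged alternative sketch: urllib.parse (already imported above) can resolve the same '../' links with urljoin; the base URL below is an assumption that the hrefs come from the jyxw.htm listing page.

# Alternative sketch, not part of the original flow: resolve relative hrefs
# with urljoin. The base URL is assumed to be the listing page the hrefs
# were scraped from.
def tran_url_urljoin(hrefs, base='http://www.zjvtit.edu.cn/xwzx/jyxw.htm'):
    return [urllib.parse.urljoin(base, h) for h in hrefs]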
def huoquyeshu_page(url):
    # Fetch the first list page and read the total number of archive pages
    # from the pager element (id "fanye126908").
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye126908"]/text()')
    # The page total sits in the last text node of the pager, in the three
    # characters just before its end.
    yeshu = result[-1][-4:-1]
    return yeshu
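The slice result[-1][-4:-1] assumes the page total always occupies exactly the three characters before the end of the pager text. A more tolerant sketch, under the assumption (not verified here) that the total appears as the last run of digits in that text:

import re

# Sketch: pull the last run of digits out of the pager text; assumes the
# total page count appears as digits somewhere in it.
def extract_page_count(pager_text):
    digits = re.findall(r'\d+', pager_text)
    return digits[-1] if digits else None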
def neirong(url):
    # Download one article page and pull out its title, publish time and body text.
    result = {}
    if url == '':
        # Nothing to fetch (e.g. the list entry did not exist): return an empty record.
        return result
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    biaoti = soup.select('.zw-title')[0].text
    time = soup.select('.zw-other')[0].text
    # The author / source line could be read from ".zz" in the same way:
    # zuozhe = soup.select('.zz')[0].text
    # author = zuozhe.split()[0].lstrip('作者:')
    # resource = zuozhe.split()[1].lstrip('来源:')

    # Keep only the publish time from the ".zw-other" line (everything before "浏").
    timesource = time.split('浏')[0].strip().lstrip('发布时间:')

    result['title'] = biaoti
    result['timesource'] = timesource
    # Join the text of every <p> in the article body except the last one.
    result['article'] = ' '.join([p.text.strip() for p in soup.select('.zw p')[:-1]])
    return result
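neirong returns a dict with the keys 'title', 'timesource' and 'article'. A minimal usage sketch; the article URL below is a hypothetical placeholder, in practice it would be one of the absolute URLs produced by tran_url:

# Usage sketch; the URL is a hypothetical placeholder, not a real article.
article = neirong('http://www.zjvtit.edu.cn/info/1017/12345.htm')
if article:
    print(article['title'], article['timesource'])
    print(article['article'][:100])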
def shouye_spider():
    # Scrape the 20 articles listed on the front news page.
    list_content_1 = []
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('The front page is ' + url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    for d in range(0, 20):
        # Each list entry sits in an element with id "line_u9_<n>".
        result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
        result_pachong = s.xpath(result_xpath_url)
        print(result_pachong)
        result_pachong = tran_url(result_pachong)
        list_content_1.append(neirong("".join(result_pachong)))
    return list_content_1
def fanye_page(yeshu, paquyeshu):
    # Scrape paquyeshu - 1 of the numbered archive pages (the current front
    # page, jyxw.htm, is handled separately by shouye_spider).
    list_content_2 = []
    for b in range(int(yeshu) - paquyeshu + 1, int(yeshu)):
        url = 'http://www.zjvtit.edu.cn/xwzx/jyxw/' + str(b) + '.htm'
        print('Page ' + str(int(yeshu) - b) + ' is ' + url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        for d in range(0, 20):
            # Each list entry sits in an element with id "line_u9_<n>",
            # the same structure as on the front page.
            result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
            result_pachong = s.xpath(result_xpath_url)
            xiuzheng_url = tran_url(result_pachong)
            list_content_2.append(neirong("".join(xiuzheng_url)))
    return list_content_2




if __name__ == '__main__':
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('Crawling news from Zhejiang Institute of Communications (浙江交通职业技术学院)')
    paquyeshu = int(input('How many pages should be crawled: '))
    list_content_fanye = []
    list_content_shouye = []
    list_content_fanye.append(fanye_page(huoquyeshu_page(url), paquyeshu))
    print(list_content_fanye)
    list_content_shouye.append(shouye_spider())
    print(list_content_shouye)
    # Write the scraped articles as one JSON object per line, skipping empty
    # records left by missing list entries.
    with open('zjjt.txt', 'w', encoding='utf-8') as file:
        for i in list_content_shouye[0]:
            if i:
                file.write(json.dumps(i, ensure_ascii=False) + '\n')
        for j in list_content_fanye[0]:
            if j:
                file.write(json.dumps(j, ensure_ascii=False) + '\n')
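Each record is written as one JSON object per line, so reading the file back only needs json.loads per line. A minimal sketch:

# Read the scraped articles back from zjjt.txt (one JSON object per line).
with open('zjjt.txt', 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]
print(len(records), 'articles loaded')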