1. Learning Python web scraping: extracting notification messages

I just finished the Python basics, and at my part-time job I was asked to keep an eye on announcements on certain web pages, so I pieced a crawler together from examples. It can only scrape one particular page, though, because different sites use different markup, so I had to write it against that one page. I'll look for a better approach later.

from urllib import request
from bs4 import BeautifulSoup
import datetime
import re

def getnews(src):
    base_url = re.search(r".+\.cn", src).group()

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # some sites refuse requests without a browser-like User-Agent
    req = request.Request(src, headers=headers)
    html = request.urlopen(req).read().decode('utf-8')  # the page's original encoding

    soup = BeautifulSoup(html, features='lxml')
    url_h = soup.find_all(attrs="ncother",
                          text=re.compile(datetime.datetime.now().strftime('%Y-%m-20')))  # today's year and month, day hard-coded to 20; adjust as needed
    for i in range(0, len(url_h)):
        url = url_h[i]
        url = url.previous_sibling.previous_sibling  # step back to the element that holds the link
        title = url.a["title"]
        url = base_url + url.a["href"]  # build the full address

        with open('C:\\Users\\user\\Desktop\\0721.txt', 'a', encoding='utf-8') as f:  # mind the encoding
            f.write(url)
            f.write('\n')
            f.write(title)
            f.write('\n\n')

getnews("http://cdst.gov.cn/Type.asp?TypeID=47&BigClassID=181")  # address[0]

7-23

Today I tried crawling several sites by handling each one with its own branch. The rough shape is there; there is still a lot to improve, but I'll leave it like this for now and come back to it later.

What it does: crawl the current day's entries from the notification lists of several government websites and save each title and URL to a txt file.
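
One caveat: both scripts build the date pattern with the day hard-coded ('%Y-%m-20' above and '%m-19' below), so they only match that fixed day rather than the current one. A minimal sketch of a pattern that actually follows today's date:

import datetime
import re

today = datetime.datetime.now().strftime('%m-%d')  # e.g. '07-23'
data = re.compile(today)                           # pattern later passed to find_all(text=...)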

from urllib import error
from urllib import request
from bs4 import BeautifulSoup
import datetime
import re
import time
import threading

def getnews(soup, i, data):  # takes the parsed page, the site index, and the date pattern
    base_url = re.search(r".+\.cn", webad[i]).group()  # relies on the global webad list
    news = []

    if i == 0:
        url_time = soup.find_all(attrs="ncother",text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling  # step back to the preceding element
            news.append(url.a["title"])  # the <a> sits under the <li>
            news.append(base_url + url.a["href"])  # build the full address
    elif i == 1:
        url_time = soup.find_all("span", attrs={"style": "color:#999; float:right;"},
                              text=data)#这里不加attrs有两个结果。。。
        for j in range(0, len(url_time)):
            url = url_time[j].next_sibling
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 2:
        url_time = soup.find_all('font', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].parent
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 3:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].next_sibling.next_sibling
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 4:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling  # next_sibling.next_sibling
            news.append(url.a.text)
            news.append(base_url + url.a["href"])
    elif i == 5:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling  # next_sibling.next_sibling
            news.append(url.a.text)
            news.append(base_url + url.a["href"])  # may fail
    elif i == 6:
        url_time = soup.find_all('span', text=data)  # the earlier branches work fine
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling  #next_sibling.next_sibling
            news.append(url.text)  # how does this compare with url["href"]?
            news.append(base_url + url["href"])
    elif i == 7:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling  # next_sibling.next_sibling
            news.append(url.text)
            news.append(base_url + url["href"])  # 可能出错

    elif i == 8:
        url_time = soup.find_all('td', text=data)  # the date sits inside a <td>
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling.previous_sibling.previous_sibling  # four siblings back...
            news.append(url.a.text)
            news.append(url.a["href"])  # href already contains base_url
    elif i == 9:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling  # next_sibling.next_sibling
            news.append(url.text)  # how does this compare with url["href"]?
            news.append(base_url + url["href"])
    elif i == 10:
        url_time = soup.find_all('i', text=data)  # the date is in an <i> tag
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling  # next_sibling.next_sibling
            news.append(url.text)  # how does this compare with url["href"]?
            news.append(base_url + url["href"])
    else:
        print('Error: site index is outside the news list range')
        exit()

    return news

def crawler(ad, i, data, headers, num=1):
    print(ad)
    req = request.Request(ad, headers=headers)
    try:
        html = request.urlopen(req).read().decode('utf-8')  # there should be a response attribute that tells the encoding
    except UnicodeDecodeError as e:
        print("Decoding error, retrying with gbk:", e.reason, '\n', ad)
        html = request.urlopen(req).read().decode('gbk')
    except error.URLError as e:
        print('HttpError:', e.reason, '\n', ad)  # used break to get out of this before
        # after wrapping this in a function, continue is no longer available
        return  # leave the function
    soup = BeautifulSoup(html, features='lxml')
    news = getnews(soup, i, data)
    with open('C:\\Users\\user\\Desktop\\0720.txt', 'a', encoding='utf-8') as f:  # mind the encoding, I ran into transcoding problems here
        for k in range(0, len(news)):  # these writes could probably be combined
            if k % 2 == 0:
                f.write(str(num))  # news item number
                f.write(':')
                num += 1
            f.write(str.strip(news[k]))  # strip whitespace; added later
            if k % 2 == 1:
                f.write('\n\n\n')


#********************************************************
if __name__=='__main__':

    webad=["http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx.shtml",#0
    "http://www.scst.gov.cn/tz/index.jhtml",
    "http://www.scjg.gov.cn/filelist_1_10.html",
    "http://www.cdst.gov.cn/Type.asp?TypeID=22&BigClassID=41",
    "http://www.scjm.gov.cn/scjxw/ggtz/common_list.shtml",#4
    "http://www.scdrc.gov.cn/sfgw/tzgg/list.shtml",#5
    "http://www.sccom.gov.cn/tzgg",
    "http://www.cdmbc.gov.cn/index.php?cid=11",#7 慢
    "http://www.cdht.gov.cn/zwgktzgg/index.jhtml",#8 慢+  网站毒,href包括base_url
    "http://www.cdibi.org.cn/article/nlist?id=symUM",#9慢+
    "http://cdhtip.cn/tzgg/",
    #"http://www.cdkjfw.com/list/second.html?Id=cNo9gOuhLFp7BkHXapNMnQ==",#没有读取这个
           ]

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # some sites refuse plain requests; this disguises the crawler as a browser
    num = 1  # news item counter
    data = re.compile(datetime.datetime.now().strftime('%m-19'))  # date pattern; mind the format (day hard-coded to 19)

    for i in range(0, len(webad)):
        t = threading.Thread(target=crawler, args=(webad[i], i, data, headers, num))  # args: URL, site index, date pattern, headers, starting number
        t.start()  # start a thread for this site
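
One more thing to watch: every thread appends to the same txt file, so lines from different sites can interleave. A minimal sketch, assuming the writing is pulled out of crawler() into a helper (save_news is a made-up name), of serializing those writes with a lock:

import threading

write_lock = threading.Lock()  # shared by all crawler threads

def save_news(news, path, num=1):  # hypothetical helper, not part of the script above
    with write_lock:  # only one thread writes at a time
        with open(path, 'a', encoding='utf-8') as f:
            for k in range(0, len(news), 2):  # news alternates title, url
                f.write('{}:{}\n{}\n\n'.format(num, news[k].strip(), news[k + 1].strip()))
                num += 1
    return num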

 
