I just finished learning the basics of Python, and at my part-time job I was asked to keep an eye on announcements on certain web pages, so I patched together a crawler by copying and adapting bits of code. It can only crawl one specific page, though: different sites mark things up with different attributes, so I had to write it against that one site. I'll look for a better approach later.
import datetime
import re
from urllib import request

from bs4 import BeautifulSoup

def getnews(src):
    base_url = re.search(r".+\.cn", src).group()
    # some sites refuse requests that don't look like a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = request.Request(src, headers=headers)
    html = request.urlopen(req).read().decode('utf-8')  # the page's original encoding
    soup = BeautifulSoup(html, features='lxml')
    # match today's date; the hard-coded day ('20') can of course be changed
    url_h = soup.find_all(attrs="ncother",
                          text=re.compile(datetime.datetime.now().strftime('%Y-%m-20')))
    for tag in url_h:
        item = tag.previous_sibling.previous_sibling  # the tag in front of the date holds the link
        title = item.a["title"]
        url = base_url + item.a["href"]  # full address
        with open('C:\\Users\\user\\Desktop\\0721.txt', 'a', encoding='utf-8') as f:  # mind the encoding
            f.write(url)
            f.write('\n')
            f.write(title)
            f.write('\n\n')

getnews("http://cdst.gov.cn/Type.asp?TypeID=47&BigClassID=181")  # address[0]
7-23
Today I tried crawling multiple sites by handling each one separately. The rough shape is there; plenty still to improve, but it'll do for now. I'll revise it later.
What it does: crawls the day's entries from the notice lists of several government sites and saves each title and URL to a txt file.
from urllib import error
from urllib import request
from bs4 import BeautifulSoup
import datetime
import re
import time
import threading
def getnews(soup, i, data):  # takes the parsed soup and the index into webad
    base_url = re.search(r".+\.cn", webad[i]).group()
    news = []
    if i == 0:
        url_time = soup.find_all(attrs="ncother", text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling  # step back to the tag in front of the date
            news.append(url.a["title"])  # the <a> sits under an <li>
            news.append(base_url + url.a["href"])  # full address
    elif i == 1:
        url_time = soup.find_all("span", attrs={"style": "color:#999; float:right;"},
                                 text=data)  # without the attrs filter this matches two results...
        for j in range(0, len(url_time)):
            url = url_time[j].next_sibling
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 2:
        url_time = soup.find_all('font', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].parent
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 3:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].next_sibling.next_sibling
            news.append(url["title"])
            news.append(base_url + url["href"])
    elif i == 4:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling
            news.append(url.a.text)
            news.append(base_url + url.a["href"])
    elif i == 5:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling
            news.append(url.a.text)
            news.append(base_url + url.a["href"])  # may break
    elif i == 6:
        url_time = soup.find_all('span', text=data)  # same as the earlier ones up to here
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling
            news.append(url.text)  # url.text is the visible link text; url["href"] is the link target
            news.append(base_url + url["href"])
    elif i == 7:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling
            news.append(url.text)
            news.append(base_url + url["href"])  # may break
    elif i == 8:
        url_time = soup.find_all('td', text=data)  # the date sits inside a <td> on this site
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling.previous_sibling.previous_sibling  # four hops back...
            news.append(url.a.text)
            news.append(url.a["href"])  # href already includes base_url on this site
    elif i == 9:
        url_time = soup.find_all('span', text=data)
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling.previous_sibling
            news.append(url.text)
            news.append(base_url + url["href"])
    elif i == 10:
        url_time = soup.find_all('i', text=data)  # the date is inside an <i> here
        for j in range(0, len(url_time)):
            url = url_time[j].previous_sibling
            news.append(url.text)
            news.append(base_url + url["href"])
    else:
        print('Error: index outside the news-site list')
        exit()
    return news
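# A refactor to try later (just a sketch, not wired into the code below):
# replace the if/elif ladder with a table of small per-site extractor
# functions, so adding a site means adding one entry instead of one more
# branch. The two entries below mirror branches 0 and 1 above; the
# function names are made up by me, not part of the working script.
def extract_site0(tag, base_url):
    item = tag.previous_sibling.previous_sibling
    return [item.a["title"], base_url + item.a["href"]]

def extract_site1(tag, base_url):
    item = tag.next_sibling
    return [item["title"], base_url + item["href"]]

extractors = {0: extract_site0, 1: extract_site1}  # ...one entry per site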
def crawler(ad, i, data, headers, num=1):
    print(ad)
    req = request.Request(ad, headers=headers)
    try:
        html = request.urlopen(req).read().decode('utf-8')  # there seems to be a response attribute that says the real encoding
    except UnicodeDecodeError as e:
        print('decoding error, retrying with "gbk":', e.reason, '\n', ad)
        html = request.urlopen(req).read().decode('gbk')
    except error.URLError as e:
        print('HttpError:', e.reason, '\n', ad)
        # can't `continue` here now that this is wrapped in a function
        return  # so just leave the function
    soup = BeautifulSoup(html, features='lxml')
    news = getnews(soup, i, data)
    with open('C:\\Users\\user\\Desktop\\0720.txt', 'a', encoding='utf-8') as f:  # mind the encoding; transcoding problems bit me here!!
        for k in range(0, len(news)):  # the title/url pairs could probably be written in one go
            if k % 2 == 0:
                f.write(str(num))  # entry number
                f.write(':')
                num += 1
            f.write(news[k].strip())  # strip stray whitespace (added later)
            if k % 2 == 1:
                f.write('\n\n\n')
#********************************************************
if __name__ == '__main__':
    webad = ["http://www.cdgy.gov.cn/cdsjxw/c132946/zwxx.shtml",  # 0
             "http://www.scst.gov.cn/tz/index.jhtml",
             "http://www.scjg.gov.cn/filelist_1_10.html",
             "http://www.cdst.gov.cn/Type.asp?TypeID=22&BigClassID=41",
             "http://www.scjm.gov.cn/scjxw/ggtz/common_list.shtml",  # 4
             "http://www.scdrc.gov.cn/sfgw/tzgg/list.shtml",  # 5
             "http://www.sccom.gov.cn/tzgg",
             "http://www.cdmbc.gov.cn/index.php?cid=11",  # 7, slow
             "http://www.cdht.gov.cn/zwgktzgg/index.jhtml",  # 8, slower; nasty site, href already includes base_url
             "http://www.cdibi.org.cn/article/nlist?id=symUM",  # 9, slower still
             "http://cdhtip.cn/tzgg/",
             # "http://www.cdkjfw.com/list/second.html?Id=cNo9gOuhLFp7BkHXapNMnQ==",  # not crawling this one
             ]
    # some sites refuse plain requests, so disguise as a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    num = 1  # starting entry number
    data = re.compile(datetime.datetime.now().strftime('%m-19'))  # date filter; mind each site's date format
    for i in range(0, len(webad)):
        # args: address, site index, date pattern, header info, entry number
        t = threading.Thread(target=crawler, args=(webad[i], i, data, headers, num))
        t.start()  # one thread per site
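Something I noticed after running it: every thread gets its own copy of num starting at 1, so entries from different sites come out with duplicate numbers, and all the threads append to the same txt file at the same time. A sketch of how I might fix the numbering with a shared lock (the counter dict, lock, and next_num are my own names, not in the script above):

import threading

write_lock = threading.Lock()
counter = {'num': 1}  # shared by every crawler thread

def next_num():
    # hold the lock so two threads never hand out the same number
    with write_lock:
        n = counter['num']
        counter['num'] += 1
        return n

The same write_lock could also wrap the f.write(...) calls in crawler, so lines from different sites don't interleave in the output file.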