Preface
It's been a long time since my last post. I've been busy learning myself, and it felt awkward to write things up before I understood them, especially when plenty of people already explain them better. With graduation coming up, I was tidying my old code and noticed that someone actually follows me (just the one, emmm), so I decided to write again: a tutorial-style code repository, so I don't forget this stuff myself and so that beginners like me can skip a few detours.
Approach
This scraper was written a long time ago and is about the simplest kind there is; anyone past the basics could write it. The idea is straightforward: request the site, take the returned HTML, use BeautifulSoup to pick out the part that carries the weather information, and then extract the data from it. For extraction this simple, a regular expression (import re) might honestly be easier: a new package takes a while to learn, while regexes apply everywhere.
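To give a taste of the regex route, here is a minimal sketch; the HTML fragment and the pattern are invented for illustration and would need adapting to the real page:

import re

# hypothetical HTML fragment, just for demonstration
html = '<dd class="weather"><b>多云</b> 18 ~ 27℃</dd>'
# an invented pattern: grab the text inside <b> and the temperature range after it
match = re.search(r'<b>(.*?)</b>\s*([^<]+)', html)
if match:
    print(match.group(1))          # 多云
    print(match.group(2).strip())  # 18 ~ 27℃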
The three programs below are basically identical; they just scrape different sites, and with small changes you can point them at whatever information you like.
This approach, however, only handles ordinary static pages; it is helpless against dynamic sites and sites with anti-scraping measures.
Simple, but still fun, haha. Go easy on me.
Development environment
The BeautifulSoup package is used to extract the information, and the pages are fetched with urllib.request (the Tieba scraper uses requests instead).
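If the packages are missing, pip can install them; lxml is needed because the code asks BeautifulSoup for the 'lxml' parser, and requests is used by the Tieba scraper:

pip install beautifulsoup4 lxml requests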
Results
Weather
Tieba
School info
Weather scraper source
import urllib.request
import re
from bs4 import BeautifulSoup

tianqi_headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
re_weather_range = re.compile(r"[^\u4e00-\u9fa5]+")  # runs of non-Chinese characters, e.g. a temperature range (not actually used below)

def get_url(name):
    TB_url = f"https://www.tianqi.com/{name}"
    return TB_url

def get_today(url, header=tianqi_headers):
    # fetch the city's main page (today's weather)
    req = urllib.request.Request(url)
    req.add_header("User-Agent", header)
    content = urllib.request.urlopen(req).read()
    return content.decode('utf-8')

def get_week(url, header=tianqi_headers):
    # appending /7 gives the 7-day forecast page
    req = urllib.request.Request(url + "/7")
    req.add_header("User-Agent", header)
    content = urllib.request.urlopen(req).read()
    return content.decode('utf-8')

def get_content(name):
    url = get_url(name)
    today = get_today(url)
    week = get_week(url)
    soup1 = BeautifulSoup(today, 'lxml')
    soup2 = BeautifulSoup(week, 'lxml')
    Today = soup1.find('dl', attrs={'class': 'weather_info'})
    Week = soup2.find('div', attrs={'class': 'weatherbox2'})
    if len(today) >= 100 and Today is None: print("failed to locate the weather block")
    if len(today) < 100: print("request failed")
    weather = []
    try:
        weather_today = {}
        the_weather = Today.find('dd', attrs={'class': 'weather'})
        weather_today['天气'] = the_weather.find_all(text=True)[5]
        weather_today['此时温度'] = "".join(the_weather.find_all(text=True)[2:4])
        weather_today['温度范围'] = the_weather.find_all(text=True)[6]
        shidu = Today.find('dd', attrs={'class': 'shidu'})
        weather_today['湿度'] = shidu.find_all(text=True)[0].split(':')[1]
        weather_today['风向'] = shidu.find_all(text=True)[1].split(':')[1]
        weather_today['紫外线'] = shidu.find_all(text=True)[2].split(':')[1]
        kongqi = Today.find('dd', attrs={'class': 'kongqi'})
        weather_today['空气质量'] = kongqi.find_all(text=True)[0].split(':')[1].strip()
        weather_today['PM'] = kongqi.find_all(text=True)[1].split(':')[1].strip()
        weather_today['日出时间'] = kongqi.find_all(text=True)[2][4:]
        weather_today['日落时间'] = kongqi.find_all(text=True)[3][4:]
        weather.append(weather_today)
    except Exception:
        print("failed to parse today's weather")
    for day in Week.find_all('a'):
        try:
            infor = {}
            infor["日期"] = day.find_all(text=True)[2]
            infor["星期"] = day.find_all(text=True)[4]
            infor["空气质量"] = day.find_all(text=True)[6]
            infor["天气"] = day.find_all(text=True)[9]
            infor["温度范围"] = "".join(day.find_all(text=True)[11:14])
            infor["风向"] = ("".join(day.find_all(text=True)[15])).split(" ")[0]
            infor["风力"] = ("".join(day.find_all(text=True)[15])).split(" ")[1]
            weather.append(infor)
        except Exception:
            print("failed to parse this week's weather")
    return weather

def myAlign(string, strnum=5, zhnum=5):
    # pad with full-width / ASCII spaces so mixed Chinese-ASCII columns line up (explained at the end)
    num1, num2 = 0, 0
    kong = ""
    for i in string:
        if u'\u4e00' <= i <= u'\u9fa5': num1 += 1
        else: num2 += 1
    if zhnum - num1 > 0: kong += u'\u3000' * (zhnum - num1)
    if strnum - num2 > 0: kong += ' ' * (strnum - num2)
    return string + kong

def showWeather(city):
    infor = get_content(city)
    print("\033[1;33;40m", end="")
    print(f"————————————————Today's weather: {city}——————————————————————————")
    print(f"{infor[0]['此时温度']}({infor[0]['温度范围']}) {infor[0]['风向']} ")
    print(f"Air: {infor[0]['空气质量']}({infor[0]['PM']}) {infor[0]['天气']}({infor[0]['湿度']}) ")
    print(f"Sunrise: {infor[0]['日出时间']} Sunset: {infor[0]['日落时间']} UV: {infor[0]['紫外线']}")
    print("————————————————————————————————————————————————————————")
    print("————————————————Next 7 days—————————————————————————————————")
    for i in range(1, 8):
        print(f"{infor[i]['日期']} {myAlign(infor[i]['星期'], 0, 3)} {myAlign(infor[i]['天气'], 0, 6)} {myAlign(infor[i]['风向'], 0, 3)}{myAlign(infor[i]['风力'], 2, 2)} {myAlign(infor[i]['温度范围'], 11, 0)} {myAlign(infor[i]['空气质量'], 0, 2)} ")

showWeather("taiyuan")
Tieba scraper source
import requests, random
from urllib.parse import quote
from bs4 import BeautifulSoup

agents = ['Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
          'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
          ]
proxy = ["http://116.211.143.11:80",
         "http://183.1.86.235:8118",
         "http://183.32.88.244:808",
         "http://121.40.42.35:9999",
         "http://222.94.148.210:808"
         ]

def get_url(name, page):
    TB_name = quote(name)  # URL-encode the (Chinese) forum name
    # Tieba lists 50 threads per page, so page n starts at pn=(n-1)*50
    TB_url = f"https://tieba.baidu.com/f?kw={TB_name}&ie=utf-8&pn={(page - 1) * 50}"
    return TB_url

def get_html(url):
    '''
    Fetch the page source
    '''
    Myheaders = {
        "User-Agent": random.choice(agents),  # pick a random user agent
    }
    # Myheaders['http'] = proxy[random.randint(0, len(proxy))]  (see the note after this program)
    try:
        r = requests.get(url, headers=Myheaders, timeout=50)  # a badly formed headers dict can make the request fail
        r.raise_for_status()   # raise on a bad HTTP status
        r.encoding = 'utf-8'   # instead of the guessed r.apparent_encoding
        return r.text
    except Exception:
        print("failed to fetch the page")
        return "failed to fetch the page"

def get_content(url):
    '''
    Parse the Tieba page and collect the thread info into a list
    '''
    comments = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    liTags = soup.find_all('li', attrs={'class': 'j_thread_list clearfix'})
    if len(html) >= 100 and len(liTags) == 0: print("failed to locate any threads")
    if len(html) < 100: print("request failed")
    for li in liTags:
        comment = {}
        try:
            comment['标题'] = li.find('a', attrs={'class': 'j_th_tit'}).text.strip()
            comment['链接'] = "http://tieba.baidu.com/" + li.find('a', attrs={'class': 'j_th_tit'})['href']
            comment['楼主'] = li.find('span', attrs={'class': 'tb_icon_author'}).text.strip()
            comment['创建时间'] = li.find('span', attrs={'class': 'pull-right is_show_create_time'}).text.strip()
            comment['楼高'] = li.find('span', attrs={'class': 'threadlist_rep_num center_text'}).text.strip()
            comments.append(comment)
        except Exception:
            print("failed to parse one thread")
    return comments

def myAlign(string, strnum=10, zhnum=10):
    num1, num2 = 0, 0
    kong = ""
    for i in string:
        if u'\u4e00' <= i <= u'\u9fa5': num1 += 1
        else: num2 += 1
    if zhnum - num1 > 0: kong += u'\u3000' * (zhnum - num1)
    if strnum - num2 > 0: kong += ' ' * (strnum - num2)
    return string + kong

def sortkey(date):
    return int(date['楼高'])

def show_TB(name, page):
    TB_url = get_url(name, page)
    threads = get_content(TB_url)
    threads.sort(key=sortkey, reverse=True)  # most-replied threads first
    print('\033[3;31m', end="")
    for i in threads:
        print('————————————————————————————————————————————————————————————————————————————————')
        print(myAlign("posted: " + i["创建时间"]), myAlign("author: " + i['楼主']), myAlign("replies: " + i["楼高"]))
        print("\033[1;32;48m %s \033[3;31m" % i["标题"])

show_TB("李毅", 1)
School info scraper source
import urllib.request
from bs4 import BeautifulSoup
import re, time

tianqi_headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
re_weather_range = re.compile(r"[^\u4e00-\u9fa5]+")  # leftover from the weather scraper, unused here

def get_url(page):
    TB_url = f"http://jiuye.tyut.edu.cn/zpweb/zpzq.aspx?page={page}"
    print(TB_url)
    return TB_url

def get_content(url, header=tianqi_headers):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", header)
    content = urllib.request.urlopen(req).read()
    return content.decode('utf-8')

def get_message(page):
    url = get_url(page)
    content = get_content(url)
    soup1 = BeautifulSoup(content, 'lxml')
    interviews = soup1.find('tr', attrs={'valign': 'top'})
    offer = []
    for interview in interviews.find_all('table'):
        try:
            tempoffer = {}
            the_offer = interview.find_all('td')
            tempoffer['公司'] = the_offer[1].get_text().strip()
            tempoffer['职位'] = the_offer[2].get_text().strip()
            tempoffer['专业'] = the_offer[3].get_text().strip()
            tempoffer['学历'] = the_offer[4].get_text().strip()
            tempoffer['时间'] = the_offer[5].get_text().strip()
            offer.append(tempoffer)
        except Exception:
            print("failed to parse one row")
    return offer

def myAlign(string, zhnum=5):
    # truncate to, or pad (with full-width spaces) up to, a fixed character count
    if len(string) >= zhnum:
        return string[:zhnum]
    else:
        num = zhnum - len(string)
        return string + u'\u3000' * num

def showoffers(pages):
    # for page in range(pages):
    infor = get_message(pages)
    print('\033[3;32m', end="")
    for i in infor:
        print("---------------------------------------------------" * 2)
        print("\033[1;31;48m", end="")
        print(myAlign(i["时间"], 15) + myAlign(i["专业"], 15) + myAlign(i["学历"], 5), end=" ")
        print(myAlign(i["公司"], 15) + myAlign(i["职位"], 15))
        print('\033[3;32m', end="")

showoffers(2)
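Note that showoffers takes a pages argument but, with the loop commented out, only fetches that one page. To walk several pages, a small sketch, assuming the site accepts consecutive page numbers:

for page in range(1, 4):  # pages 1 to 3
    showoffers(page)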
Notes on specific parts of the code
Overall there is nothing hard about fetching the page and extracting the information; copy-paste the code and it should make sense on its own. But to make the output look nicer (even though it only prints to the console), I controlled the colors and the field spacing.
About colors (these \033[...m sequences are ANSI escape codes):
print("\033[1;30m %s \033[3;30m" %" 30")
print("\033[1;31m %s \033[3;30m" %" 31")
print("\033[1;32m %s \033[3;30m" %" 32")
print("\033[1;33m %s \033[3;30m" %" 33")
print("\033[1;34m %s \033[3;30m" %" 34")
print("\033[1;35m %s \033[3;30m" %" 35")
print("\033[1;36m %s \033[3;30m" %" 36")
print("\033[1;37m %s \033[3;30m" %" 37")
print("\033[1;38m %s \033[3;30m" %" 38")
print("\033[1;39m %s \033[3;30m" %" 39")
print("\033[1;40m %s \033[3;30m" %" 40")
print("\033[1;41m %s \033[3;30m" %" 41")
print("\033[1;42m %s \033[3;30m" %" 42")
print("\033[1;43m %s \033[3;30m" %" 43")
print("\033[1;44m %s \033[3;30m" %" 44")
print("\033[1;45m %s \033[3;30m" %" 45")
print("\033[1;46m %s \033[3;30m" %" 46")
print("\033[1;47m %s \033[3;30m" %" 47")
print("\033[1;48m %s \033[3;30m" %" 48")
print("\033[1;49m %s \033[3;30m" %" 49")
print('\033[1;31;48m','*' * 50,'\n\033[7;31;40m 错误次数超限,用户已被永久锁定,请联系管理员! \033[1;31;48m\n','*' * 50,'\033[0m')
Run it and compare the outputs and you'll see the pattern: 30-37 select the foreground color, 40-47 the background, the leading 1 means bold/bright, and \033[0m resets everything.
As for field spacing: Chinese and non-Chinese characters occupy different widths on screen, and a full-width (Chinese) space is likewise wider than an ASCII space, so mixed text rarely lines up on its own. My fix: decide in advance how many Chinese-width and how many ASCII-width characters each field should occupy, and pad with the matching kind of space whenever the string falls short. The code is as follows.
def myAlign(string, strnum=5, zhnum=5):  # the string, expected non-Chinese count, expected Chinese count
    num1, num2 = 0, 0  # counts of Chinese / non-Chinese characters
    kong = ""
    for i in string:  # count the Chinese and non-Chinese characters
        if u'\u4e00' <= i <= u'\u9fa5': num1 += 1
        else: num2 += 1
    if zhnum - num1 > 0: kong += u'\u3000' * (zhnum - num1)  # fewer Chinese characters than expected: pad with full-width spaces
    if strnum - num2 > 0: kong += ' ' * (strnum - num2)      # fewer other characters than expected: pad with ASCII spaces
    return string + kong  # return the padded string
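A quick demonstration with the default widths; every line should end with the | in the same column, provided your terminal renders full-width spaces at double width:

print(myAlign("晴") + "|")
print(myAlign("雷阵雨") + "|")
print(myAlign("3-4级") + "|")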
Summary
Nothing yet. The one person following me is probably a bot anyway, /(ㄒoㄒ)/~~