第二次尝试:
这次依然选用天气预报为例子(数据多)
但换成了国内的,API似乎有点简单
在解析网页和编写正则表达式这两处卡了很久
在看源码的过程中,我发现了网页广告是如何插入网页的:其实就是一堆链接……
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.select import Select
# Module-level accumulator: get_parseText() appends one
# [date, weather, temperature, wind, wind_lev] row per parsed <li>,
# and main() passes the list to prints().
weather_list = []
def get_url(city_code='101270101'):
    """Build the weather.com.cn forecast page URL for a city code.

    Args:
        city_code: Code inserted into the URL path. Defaults to
            '101270101', the value that used to be hard-coded here, so
            existing ``get_url()`` calls return the same URL as before.

    Returns:
        The page URL as a string.
    """
    # Alternate page noted by the original author:
    # www.weather.com.cn/weather1d/101270101.shtml#input
    return 'http://www.weather.com.cn/weather/{}.shtml'.format(city_code)
def get_urlText(url):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string, or None (after printing
        'error 1') when the request fails or returns an HTTP error status.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout requests can wait indefinitely on a stalled
        # connection, which would hang the whole script.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Only request/HTTP failures are handled; a bare except would also
        # swallow KeyboardInterrupt and programming errors.
        print('error 1')
        return None
    # Use the encoding detected from the body so the text decodes more
    # accurately than with the header-declared encoding alone.
    response.encoding = response.apparent_encoding
    return response.text
def get_parseText(parse_url):
    """Parse forecast HTML and append one row per day to ``weather_list``.

    Every ``<li>`` inside ``<ul class="t clearfix">`` yields a row
    ``[date, weather, temperature, wind, wind_lev]`` that is appended to the
    module-level ``weather_list``. Rows parsed before a failure are kept.

    Args:
        parse_url: HTML text of the page (page content, not a URL, despite
            the name). None or an empty string is reported as a parse error.

    Returns:
        None. Prints 'error 2' when the expected markup cannot be read.
    """
    if not parse_url:
        # Nothing to parse, e.g. the caller's download step failed.
        print('error 2')
        return
    try:
        soup = BeautifulSoup(parse_url, 'html.parser')
        for day in soup.find('ul', 't clearfix').find_all('li'):
            date = day.find('h1').get_text()
            weather = day.find('p', 'wea').get_text()
            # Only the <i> element inside <p class="tem"> is read.
            temperature = day.find('p', 'tem').find('i').get_text()
            wind_box = day.find('p', 'win')
            em = wind_box.find('em')
            # Read the title attributes through the parser rather than with
            # a regex over the serialized tag: values come back
            # entity-decoded and do not depend on attribute order or quoting.
            if em is None:
                titles = []
            else:
                titles = [tag['title'] for tag in em.find_all(attrs={'title': True})]
            wind = '-'.join(titles)
            wind_lev = wind_box.find('i').get_text()
            # append() mutates the module-level list in place, so no
            # ``global`` declaration is needed.
            weather_list.append([date, weather, temperature, wind, wind_lev])
    except Exception:
        # Deliberate best-effort: any layout mismatch (a missing tag shows up
        # as AttributeError on None) is reported instead of crashing, while
        # KeyboardInterrupt/SystemExit still propagate.
        print('error 2')
        return
def prints(weather_list):
    """Write the forecast rows to the text file 'weatherlist' as a table.

    The file is (re)created in the current working directory, UTF-8 encoded,
    with a header line followed by one tab-separated line per row.

    Args:
        weather_list: Iterable of rows; the first five items of each row are
            written as date, weather, temperature, wind direction and wind
            level.

    Returns:
        None.
    """
    # Each field is centred in a width of 10. Argument 5 is the fill
    # character: the original template never referenced it, so the
    # full-width space passed to format() was silently ignored and the
    # columns were padded with ASCII spaces instead.
    tplt = '{0:{5}^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:{5}^10}\t{4:{5}^10}'
    fill = chr(12288)  # full-width (CJK) space
    # ``with`` guarantees the file is closed even if a row fails to format.
    with open('weatherlist', 'w', encoding='utf8') as f:
        f.write(tplt.format('日期', '天气', '温度', '风向', '风级', fill))
        f.write('\n')
        for elem in weather_list:
            f.write(tplt.format(elem[0], elem[1], elem[2], elem[3], elem[4], fill))
            f.write('\n')
def main():
    """Fetch the forecast page, parse it into ``weather_list`` and save it."""
    page_text = get_urlText(get_url())
    get_parseText(page_text)
    prints(weather_list)
# Run the whole pipeline as soon as this file is executed; there is no
# __main__ guard, so importing the file triggers it as well.
main()
以下为源码
(不得不说,源码有500多行,可能是我还不太熟练,找了很久才发现每一天的天气都放在形如 <li class="sky skyid lv2 on"> 的标签里)
<ul class="t clearfix">
<li class="sky skyid lv2 on">
<h1>4日(今天)</h1>
<big class="png40 d01"></big>
<big class="png40 n07"></big>
<p title="多云转小雨" class="wea">多云转小雨</p>
<p class="tem">
<span>16</span>/<i>9℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv3">
<h1>5日(明天)</h1>
<big class="png40 d07"></big>
<big class="png40 n07"></big>
<p title="小雨" class="wea">小雨</p>
<p class="tem">
<span>15</span>/<i>9℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv2">
<h1>6日(后天)</h1>
<big class="png40 d01"></big>
<big class="png40 n01"></big>
<p title="多云" class="wea">多云</p>
<p class="tem">
<span>15</span>/<i>9℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv2">
<h1>7日(周六)</h1>
<big class="png40 d01"></big>
<big class="png40 n07"></big>
<p title="多云转小雨" class="wea">多云转小雨</p>
<p class="tem">
<span>17</span>/<i>11℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv3">
<h1>8日(周日)</h1>
<big class="png40 d07"></big>
<big class="png40 n07"></big>
<p title="小雨" class="wea">小雨</p>
<p class="tem">
<span>15</span>/<i>9℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv2">
<h1>9日(周一)</h1>
<big class="png40 d01"></big>
<big class="png40 n01"></big>
<p title="多云" class="wea">多云</p>
<p class="tem">
<span>14</span>/<i>6℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
<li class="sky skyid lv2">
<h1>10日(周二)</h1>
<big class="png40 d01"></big>
<big class="png40 n00"></big>
<p title="多云转晴" class="wea">多云转晴</p>
<p class="tem">
<span>15</span>/<i>3℃</i>
</p>
<p class="win">
<em>
<span title="无持续风向" class="NNW"></span>
<span title="无持续风向" class="NNW"></span>
</em>
<i><3级</i>
</p>
<div class="slid"></div>
</li>
</ul>
准备改进 :
用csv输入输出
将中国各个城市的代码写入文件中。
import re
import csv
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.select import Select
# Module-level accumulator: get_parseText() appends one
# [date, weather, temperature, wind, wind_lev] row per parsed <li>,
# and main() passes the list to prints().
weather_list = []
def get_url(city_code='101270101'):
    """Build the weather.com.cn forecast page URL for a city code.

    Args:
        city_code: Code inserted into the URL path. Defaults to
            '101270101', the value that used to be hard-coded here, so
            existing ``get_url()`` calls return the same URL as before.

    Returns:
        The page URL as a string.
    """
    # Alternate page noted by the original author:
    # www.weather.com.cn/weather1d/101270101.shtml#input
    return 'http://www.weather.com.cn/weather/{}.shtml'.format(city_code)
def get_urlText(url):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string, or None (after printing
        'error 1') when the request fails or returns an HTTP error status.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout requests can wait indefinitely on a stalled
        # connection, which would hang the whole script.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Only request/HTTP failures are handled; a bare except would also
        # swallow KeyboardInterrupt and programming errors.
        print('error 1')
        return None
    # Use the encoding detected from the body so the text decodes more
    # accurately than with the header-declared encoding alone.
    response.encoding = response.apparent_encoding
    return response.text
def get_parseText(parse_url):
    """Parse forecast HTML and append one row per day to ``weather_list``.

    Every ``<li>`` inside ``<ul class="t clearfix">`` yields a row
    ``[date, weather, temperature, wind, wind_lev]`` that is appended to the
    module-level ``weather_list``. Rows parsed before a failure are kept.

    Args:
        parse_url: HTML text of the page (page content, not a URL, despite
            the name). None or an empty string is reported as a parse error.

    Returns:
        None. Prints 'error 2' when the expected markup cannot be read.
    """
    if not parse_url:
        # Nothing to parse, e.g. the caller's download step failed.
        print('error 2')
        return
    try:
        soup = BeautifulSoup(parse_url, 'html.parser')
        for day in soup.find('ul', 't clearfix').find_all('li'):
            date = day.find('h1').get_text()
            weather = day.find('p', 'wea').get_text()
            # Only the <i> element inside <p class="tem"> is read.
            temperature = day.find('p', 'tem').find('i').get_text()
            wind_box = day.find('p', 'win')
            em = wind_box.find('em')
            # Read the title attributes through the parser rather than with
            # a regex over the serialized tag: values come back
            # entity-decoded and do not depend on attribute order or quoting.
            if em is None:
                titles = []
            else:
                titles = [tag['title'] for tag in em.find_all(attrs={'title': True})]
            wind = '-'.join(titles)
            wind_lev = wind_box.find('i').get_text()
            # append() mutates the module-level list in place, so no
            # ``global`` declaration is needed.
            weather_list.append([date, weather, temperature, wind, wind_lev])
    except Exception:
        # Deliberate best-effort: any layout mismatch (a missing tag shows up
        # as AttributeError on None) is reported instead of crashing, while
        # KeyboardInterrupt/SystemExit still propagate.
        print('error 2')
        return
def prints(weather_list):
    """Write the forecast rows to 'weather.csv' below a header row.

    The file is (re)created in the current working directory, UTF-8 encoded.

    Args:
        weather_list: Iterable of rows (sequences of cell values) written
            after the header, one CSV record per row.

    Returns:
        None.
    """
    titles = ['日期', '天气', '温度', '风向', '风级']
    # newline='' hands line-ending control to the csv module; without it the
    # writer's '\r\n' is translated again on Windows and every record is
    # followed by a blank line.
    with open('weather.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(titles)
        writer.writerows(weather_list)
def main():
    """Fetch the forecast page, parse it into ``weather_list`` and save it."""
    page_text = get_urlText(get_url())
    get_parseText(page_text)
    prints(weather_list)
# Run the whole pipeline as soon as this file is executed; there is no
# __main__ guard, so importing the file triggers it as well.
main()