中国天气:你们天天爬人家喔。
使用库 selenium,xpath,mongodb
一.无图无真相
省市县表
一天时段天气信息
二、流程数据分析
点击北京下面是北京的区,点击河北->石家庄->鹿泉。ok,是直辖市的只有区两层,不是直辖市的则是省市县三层。
用selenium模拟浏览器依次点点点,有人问为啥不用解析Html,看下图因为是js操作,我也没抓到接口。顺道使用selenium练习练习啊
有了selenium真的so easy,我觉得真的是太好用了。啥网页都能搞,只要我一个一个地模拟人点就行啦
1.找到输入框点击弹出下拉框
driver.find_element_by_id("txtZip").click()
2.获取页面所有省份名称
html.xpath("//dd[@id='searchCityList']/a/text()") # 获取省份集合
3.模拟点击省份,判断省份是否为直辖市, 是直辖市直接提取区链接。不是则点击省份获取市
driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click() # 点击市
4.点击市获取县名称和链接
self.driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click() # 点击市
5.获取县链接,用正则表达式匹配是否为链接,因为有个别是js代码
# 获取城市连接
def get_link(self, str):
print("城市名称:", str)
html = etree.HTML(self.driver.page_source)
cityStr = "//a[@title='{}']/@href".format(str)
print("城市链接", cityStr)
citylinks = html.xpath(cityStr)
print("获取城市链接", citylinks)
for link in citylinks:
isLink = re.search(r'^http://www.\w+.*', link)
if isLink:
return link
6.存入mongodb数据库,请求链接后使用xpath解析数据就行了,看看我们需要哪些数据
times = html.xpath("//div[@class='time']/em/text()") # 时间
print(times)
wpics = html.xpath("//div[@class='wpic']/div/big/@title") # 天气
tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()") # 温度
print(tems)
winfs = html.xpath("//div[@class='winf']/em/text()") # 风向
winls = html.xpath("//div[@class='winl']/em/text()") # 等级
三、代码
import asyncio
import re
import time
from lxml import etree
from selenium import webdriver
import pymongo
# Municipalities directly under the central government: these have only a
# district level (two tiers), unlike regular province -> city -> county regions.
province_remove_list = ['北京', '上海', '重庆', '天津'] # used to detect municipalities
mongo = pymongo.MongoClient(host='localhost', port=27017)
db = mongo['python'] # database handle
weatherCollection = db.weather # collection: province/city/county documents with links
linkCollection = db.link # collection: links (assigned below, not otherwise used in this file)
weatherInfoCollection = db.weather_info # collection: per-city parsed weather details
tasks = [] # asyncio tasks accumulated by MongodbHelp.select_data_weather
# MongoDB helper class (2)
class MongodbHelp:
    """Persists the scraped region tree and drives async parsing of each link."""

    def __init__(self):
        global db, weatherCollection, linkCollection
        self.weatherCollection = weatherCollection  # province/city/county + link docs
        self.linkCollection = linkCollection        # link collection
        self.spider = WeatherSpider()
        self.loop = asyncio.get_event_loop()  # event loop for the async requests

    def insert_data_weather(self, data):
        """Insert the province/city/county + link documents."""
        result = self.weatherCollection.insert_many(data)
        print(result)

    def insert_data_weather_info(self, data):
        """Insert per-city weather-detail documents.

        Bug fix: the original inserted into weatherCollection (copy-paste
        error); detail records belong in the weather_info collection.
        """
        result = weatherInfoCollection.insert_many(data)

    def select_data_weather(self):
        """Walk every stored region and schedule an async parse per link."""
        for x in self.weatherCollection.find():  # every stored link document
            province = x["province"]  # province name
            if province not in province_remove_list:
                # Regular province: descend province -> city -> county.
                citys = x["citys"]
                for city in citys:
                    countys = city["countys"]
                    for county in countys:
                        name = county["county"]  # county name
                        link = county["link"]    # weather-page link
                        print(name, link)
                        task = asyncio.ensure_future(self.spider.parse_html(link, name))
                        tasks.append(task)
            else:
                # Municipality: the link is stored directly on the document.
                link = x["link"]
                name = province
                print(name, link)
                task = asyncio.ensure_future(self.spider.parse_html(link, name))
                tasks.append(task)
        if tasks:  # asyncio.wait raises ValueError on an empty task set
            self.loop.run_until_complete(asyncio.wait(tasks))

    def close(self):
        """Close the shared MongoDB client."""
        global mongo
        mongo.close()
# Weather-link parsing class (3)
class WeatherSpider:
    """Fetches a city's weather page headlessly and stores the parsed data."""

    def __init__(self):
        # No per-instance state; each parse_html call builds its own driver.
        # (The original __init__ had an empty body, which is a syntax error.)
        pass

    async def parse_html(self, url, name):
        """Load *url* in headless Chrome, extract the forecast, persist it.

        url  -- weather-page link collected by WeatherSelenium.
        name -- region name, stored as the document's "address" field.
        """
        records = []  # renamed from `list`, which shadowed the builtin
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # no visible browser window
        driver = webdriver.Chrome(options=options)
        try:
            # Plain HTTP requests returned nothing once anti-scraping kicked
            # in, so drive a real browser instead.
            driver.get(url)
            html = etree.HTML(driver.page_source)
            times = html.xpath("//div[@class='time']/em/text()")  # time slots
            print(times)
            wpics = html.xpath("//div[@class='wpic']/div/big/@title")  # weather
            tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()")  # temperature
            print(tems)
            winfs = html.xpath("//div[@class='winf']/em/text()")  # wind direction
            winls = html.xpath("//div[@class='winl']/em/text()")  # wind level
            for i in range(len(times)):
                print(times[i], wpics[i], tems[i], winfs[i], winls[i])
                records.append({
                    "times": times[i],
                    "wpics": wpics[i],
                    "tems": tems[i],
                    "winfs": winfs[i],
                    "winls": winls[i]
                })
            data = {
                "address": name,
                "weather": records
            }
            # Module-level collection; the original declared
            # `global weatherCollection` but actually used weatherInfoCollection.
            weatherInfoCollection.insert_one(data)
        finally:
            driver.quit()  # always release the browser, even if parsing fails
# Browser-driving class (1)
class WeatherSelenium:
    """Clicks through weather.com.cn's region picker to collect region links."""

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.get("http://www.weather.com.cn/")
        self.citys = []    # cities of the province currently being walked
        self.countys = []  # counties of the city currently being walked
        self.help = MongodbHelp()
        self.datas = []    # accumulated province documents

    def get_province(self):
        """Open the picker, walk every province, then persist everything."""
        self.driver.find_element_by_id("txtZip").click()  # click input to open drop-down
        time.sleep(1)  # wait for the list to render before reading page_source
        provinces = self.get_name_list(1)
        for province in provinces:
            time.sleep(2)
            self.get_city(province)  # click into this province
        print("存入数据---》", self.datas)
        self.help.insert_data_weather(self.datas)
        self.citys.clear()   # safe to clear now: documents already inserted
        self.countys.clear()
        self.datas.clear()
        self.driver.quit()

    def get_name_list(self, isProvince):
        """Return the names in the currently displayed list.

        isProvince truthy -> top-level province list; otherwise the
        city/county list (both levels live in the cityList_city container).
        """
        html = etree.HTML(self.driver.page_source)  # parse current DOM
        if isProvince:
            return html.xpath("//dd[@id='searchCityList']/a/text()")
        return html.xpath("//dd[@id='cityList_city']/a/text()")

    def get_city(self, province):
        """Click into *province* and record its cities, counties and links."""
        print("获取省份", province)
        xpathStr = "//dd[@id='searchCityList']/a[@title='{}']"
        if province not in province_remove_list:
            # Regular province: click in, then walk each city.
            self.driver.find_element_by_xpath(xpathStr.format(province)).click()
            citys = self.get_name_list(0)
            print(province, str(citys))
            for city in citys:
                self.driver.find_element_by_xpath(
                    "//dd[@id='cityList_city']/a[@title='{}']".format(city)).click()
                countys = self.get_name_list(0)  # county names of this city
                time.sleep(1)
                for county in countys:
                    link = self.get_link(county)
                    print(link)
                    self.countys.append({'county': county, 'link': link})
                self.citys.append({'city': city, 'countys': self.countys})
                # Rebind (don't clear()) so the list just stored stays intact.
                self.countys = []
                time.sleep(1)
                self.driver.find_element_by_xpath("//span[@class='province-back']").click()
            self.datas.append({'province': province, 'citys': self.citys})
            # Bug fix: rebind citys too, otherwise every later province
            # document shares (and keeps growing) the same city list.
            self.citys = []
            self.driver.find_element_by_xpath("//span[@class='province-back']").click()  # back to province list
        else:
            # Municipality: grab its link directly, there is no city level.
            link = self.get_link(province)
            self.datas.append({'province': province, 'link': link})

    # Get the weather-page link for a region name.
    def get_link(self, name):
        """Return the first http:// href of an anchor titled *name*.

        Some anchors carry javascript: hrefs, so filter with a regex.
        (Parameter renamed from `str`, which shadowed the builtin; it is
        only ever called positionally in this file.)
        """
        print("城市名称:", name)
        html = etree.HTML(self.driver.page_source)
        cityStr = "//a[@title='{}']/@href".format(name)
        print("城市链接", cityStr)
        citylinks = html.xpath(cityStr)
        print("获取城市链接", citylinks)
        for link in citylinks:
            # Escaped the dot: the original `www.` matched any character there.
            if re.search(r'^http://www\.\w+.*', link):
                return link
if __name__ == '__main__':
    # Stage 1: crawl the province/city/county tree into MongoDB.
    crawler = WeatherSelenium()  # renamed from `selenium` (shadowed the package name)
    crawler.get_province()
    # Stage 2: read the stored links back and parse each weather page.
    mongo_help = MongodbHelp()  # renamed from `help` (shadowed the builtin)
    mongo_help.select_data_weather()
    mongo_help.close()
四、总结
- 学习使用selenium模拟浏览器,遇到js执行可以暴力操作.. 缺点需要全部加载浪费资源。
- 使用sleep延迟等待网页加载完成,解析数据,否则会出现元素找不到。
- list的clear()是原地清空内存,之前保存到别处的同一个列表引用也会跟着变空,所以应当用[]重新赋值来置空