前言
这一次也是爬数据,也会把源码贴出来
一、爬取的数据
test4
共有4933条
二、爬虫的源码
from selenium import webdriver
import time
import warnings
import pandas as pd
import csv
warnings.filterwarnings("ignore")
# Scrape the price table shown on the Huawei Cloud ECS pricing page for every
# combination of the five option selectors, then dump all rows to a CSV file.
CHROMEDRIVER_PATH = r"C:\Users\dell\AppData\Local\Google\Chrome\Application\chromedriver.exe"
PRICING_URL = "https://www.huaweicloud.com/pricing.html?tab=detail#/ecs"
OUTPUT_CSV = "D:/test0716.csv"

# Absolute XPaths of the option groups, in the order they are clicked.  The
# label read at each level fills the first five CSV columns
# (区域, CPU架构, 规格, 类型, 镜像).
OPTION_XPATHS = (
    "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[2]/div[1]/div/div/div[2]/div/div/div",
    "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div/div",
    "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div/div",
    "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div/div",
    "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[2]/div[4]/div/div/div[2]/div/div/div",
)

# Absolute XPath of the body rows of the price table.
ROW_XPATH = "/html/body/div[3]/div[1]/div/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[3]/div/div/div[3]/table/tbody/tr"

COLUMN_NAMES = ["区域", "CPU架构", "规格", "类型", "镜像", "规格名称", "核数", "内存",
                "价格单位", "按小时", "包月", "包1年", "包2年", "包3年", "包4年", "包5年"]


def clean_price(text):
    """Return "0" for the page's "--" placeholder, otherwise *text* unchanged."""
    return "0" if text == "--" else text


def price_fields(hourly, monthly, longer_terms):
    """Build the price part of a record from the raw cell texts.

    hourly is always kept.  monthly is dropped when its text is "元": in that
    case the fifth cell is presumably already the unit column (table without a
    monthly price) -- TODO confirm against the live page.  longer_terms are the
    texts of the cells between the monthly cell and the trailing unit cell.
    Every "--" placeholder becomes "0".
    """
    fields = [clean_price(hourly)]
    monthly = clean_price(monthly)
    if monthly != "元":
        fields.append(monthly)
    fields.extend(clean_price(price) for price in longer_terms)
    return fields


def fit_rows_to_header(rows, header):
    """Return (columns, rows) with every row exactly as wide as the header.

    Rows can be shorter than the header when a table shows fewer price
    columns.  pandas refuses to build a DataFrame when the data width and the
    number of column names differ, which would throw away the whole scrape, so
    short rows are padded with None and, if any row is wider than the header,
    generic "extra_N" column names are appended instead of dropping data.
    """
    width = max([len(header)] + [len(row) for row in rows])
    columns = list(header) + ["extra_%d" % k for k in range(1, width - len(header) + 1)]
    padded = [list(row) + [None] * (width - len(row)) for row in rows]
    return columns, padded


def read_row(tr):
    """Read one table row and return its values after the five option labels.

    Order: spec name, cores, memory, unit, hourly price, then the remaining
    prices.  Raises IndexError when the row has fewer than five cells.
    """
    # Fetch the cells once instead of re-querying the row for every column.
    cells = tr.find_elements_by_tag_name("td")

    def span_text(cell):
        return cell.find_element_by_class_name("cell-normal").find_element_by_tag_name("span").text

    record = [span_text(cells[0]), span_text(cells[1]), span_text(cells[2])]
    record.append(span_text(cells[-1]))  # the unit is the last cell of the row
    hourly = span_text(cells[3])
    monthly = span_text(cells[4])
    # Cells from index 5 up to (not including) the unit cell are read through
    # the "cell" class rather than the "cell-normal" span.
    longer_terms = [cell.find_element_by_class_name("cell").text for cell in cells[5:-1]]
    record.extend(price_fields(hourly, monthly, longer_terms))
    return record


def walk_options(driver, level_xpaths, labels, rows):
    """Click through every option of every remaining level and collect rows.

    level_xpaths holds the XPaths of the option groups still to be visited and
    labels the texts of the options already selected.  When no level is left
    the rows of the currently displayed table are appended to *rows*, each
    prefixed with *labels*.
    """
    if not level_xpaths:
        for tr in driver.find_elements_by_xpath(ROW_XPATH):
            record = list(labels) + read_row(tr)
            print(record)
            rows.append(record)
        return

    for option in driver.find_elements_by_xpath(level_xpaths[0]):
        option.find_element_by_tag_name("button").click()
        time.sleep(1)  # give the page time to refresh the dependent widgets
        label = option.find_element_by_tag_name("button").find_element_by_tag_name("span").text
        print(label)
        # The next level is looked up only after this click, as its options
        # are queried from the page in its current state.
        walk_options(driver, level_xpaths[1:], list(labels) + [label], rows)


def main():
    """Run the scrape and write the collected rows to OUTPUT_CSV."""
    center_info = []
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    try:
        driver.get(PRICING_URL)
        time.sleep(3)  # wait for the initial page render
        walk_options(driver, OPTION_XPATHS, [], center_info)
    finally:
        # Always close the browser, even when an element lookup fails midway.
        driver.quit()

    columns, data = fit_rows_to_header(center_info, COLUMN_NAMES)
    test = pd.DataFrame(columns=columns, data=data)
    test.to_csv(OUTPUT_CSV)


if __name__ == "__main__":
    main()