A study of the concrete script-level usage of the Webdriver / json / re / BeautifulSoup APIs
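Before the full script, here is a minimal, self-contained sketch of the BeautifulSoup + re parsing pattern the crawler relies on. The HTML snippet, variable names, and printed values are illustrative assumptions, not Sogou's real markup:

# Minimal sketch of the select() / re.split pattern used in get_cont below.
# The sample_html snippet is made up for illustration only.
from bs4 import BeautifulSoup
import re

sample_html = '<div class="rb"><h3>瑞虎8 报价</h3><cite>汽车之家-2018-12-06</cite></div>'
soup = BeautifulSoup(sample_html, 'lxml')
block = soup.select('div[class="rb"]')[0]
cite = block.find_all('cite')[0].text.strip()
source, pub_date = re.split('-', cite, maxsplit=1)
print(source)    # -> 汽车之家
print(pub_date)  # -> 2018-12-06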
# -*- coding:utf-8 -*-
# Filename: complete Sogou crawler script
# Author: Guan
# Datetime: 2018/12/6
# Package imports
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from selenium import webdriver
import datetime
# get_url: helper that simply passes the request URL through (the print is kept for debugging)
def get_url(url):
    # print(url)
    return url
# get_html: fetch a result page with requests and return the decoded HTML
def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        # Note: this is a captured session cookie; Sogou sessions expire, so it likely needs refreshing before reuse
        "Cookie": "CXID=2C7D3DCAAA31333F0CA6B9F1D42B448E; SUID=07E1EB7C5B68860A5BEA43270009A690; ad=oujQFyllll2bf6GXlllllVs$yDolllllKnsPxZllllylllllRv7ll5@@@@@@@@@@; ABTEST=7|1543394206|v17; SUV=1543394206395727; browerV=3; osV=1; pgv_pvi=6458014720; SUIR=A2444FD8A4A1D8F49FA7A706A50AC1B8; IPLOC=CN1100; sct=205; PHPSESSID=0vtaku6mnueca0of31id4vf315; Hm_lvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309; Hm_lpvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309; taspeed=taspeedexist; sst0=606; ld=klllllllll2b@On0lllllVZJnG9lllllKnsPxZlllj6lllllxklll5@@@@@@@@@@; pgv_si=s6807446528; SNUID=F6299ACCF7F284AC18D7C7B8F8B242AD; seccodeRight=success; successCount=1|Thu, 06 Dec 2018 15:13:51 GMT"
    }
    response = requests.get(url=url, headers=headers).content.decode()
    return response
# get_cont: parse one result page and append the extracted records to a local file
# (the page URL is passed in explicitly instead of being read from a global)
def get_cont(html, url):
    soup = BeautifulSoup(html, 'lxml')
    # Rule one: result blocks selected by their inline width style
    regulation_one = soup.select('div[style="width:548px"]')
    cont_list = []
    for i in regulation_one:
        cont_dict = {}
        cont_dict['url'] = get_url(url)
        cont_dict['title'] = i.find_all('h3')[0].text.strip()
        cont_dict['cont'] = i.find_all('p')[0].text.strip()
        # Split the cite text into source and publish date
        pat = re.split(' - ', i.find_all('cite')[0].text.strip())
        cont_dict['source'] = pat[0]
        try:
            reg = re.split('-', pat[1], maxsplit=1)
            cont_dict['source'] = reg[0]
            cont_dict['pub_date'] = reg[1].strip()
        except Exception:
            reg1 = re.split('-', pat[0], maxsplit=1)
            cont_dict['source'] = reg1[0]
            cont_dict['pub_date'] = reg1[1].strip()
        cont_list.append(cont_dict)
    # print(cont_list)
    # Rule two: result blocks selected by class "rb"
    regulation_two = soup.select('div[class="rb"]')
    cont_list2 = []
    for j in regulation_two:
        cont_dict2 = {}
        cont_dict2['url'] = get_url(url)
        cont_dict2['title'] = j.find_all('h3')[0].text.strip()
        cont_dict2['cont'] = j.select('div[class="ft"]')[0].text.strip()
        # Regex split of the cite text into source and publish date
        regulation = re.split(' - ', j.find_all('cite')[0].text.strip())
        reg = re.split('-', regulation[0], maxsplit=1)
        cont_dict2['source'] = reg[0]
        try:
            cont_dict2['pub_date'] = re.sub('翻译此页', '', reg[1]).strip()
        except Exception:
            reg1 = re.split('-', regulation[1], maxsplit=1)
            cont_dict2['source'] = reg1[0]
            cont_dict2['pub_date'] = re.sub('翻译此页', '', reg1[1]).strip()
        cont_list2.append(cont_dict2)
    # Append the parsed records to the local output file
    with open(r'D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗本地数据\自动翻页数据存储', 'a', encoding='utf-8') as f:
        # Serialize rule-one records as JSON lines
        for regulation1 in cont_list:
            record = json.dumps(regulation1, ensure_ascii=False)
            print(record)
            f.write(record + '\n')
        # Serialize rule-two records as JSON lines
        for regulation2 in cont_list2:
            record2 = json.dumps(regulation2, ensure_ascii=False)
            print(record2)
            f.write(record2 + '\n')
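# For reference, every line that get_cont appends to the output file is one JSON object;
# a record looks roughly like the following (the field values here are hypothetical):
# {"url": "http://www.sogou.com/web?query=瑞虎8&page=1&ie=utf8", "title": "瑞虎8 报价", "cont": "...", "source": "汽车之家", "pub_date": "2018-12-06"}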
# Script entry point
if __name__ == '__main__':
    # url = get_url('http://www.sogou.com/web?query=%E7%91%9E%E8%99%8E8&page=12&ie=utf8')
    # html = get_html(url)
    # get_cont(html, url)
    # Read the configured car models (comma-separated names) from the local file
    with open(r'D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗配置车型', encoding='utf-8') as file:
        cont = file.readlines()
    chexing = []
    for i in cont:
        new_chexing = i.split(',')
        for j in new_chexing:
            # strip the trailing newline so it does not end up in the query URL
            chexing.append(j.strip())
    print(chexing)
    # Fetch and parse the first result page for each car model
    for cx in chexing:
        url = get_url('http://www.sogou.com/web?query=%s&page=1&ie=utf8' % cx)
        print('Fetching data for car model %s' % cx)
        html = get_html(url)
        get_cont(html, url)
        # Page through the remaining results with Selenium
        driver = webdriver.Chrome()
        driver.get(url)
        try:
            while True:
                time.sleep(3)
                driver.find_element_by_xpath('//*[@id="sogou_next"]').click()
                url = driver.current_url
                get_url(url)
                html = get_html(url)
                get_cont(html, url)
        except Exception:
            print('No next page found; finished paging for this model')
            driver.quit()
    print('Data crawling finished')
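The 搜狗配置车型 input file read in __main__ is assumed to be a plain UTF-8 text file holding comma-separated car-model names, one or more per line, for example (the model names below are only illustrative):

瑞虎8,瑞虎3
艾瑞泽5,艾瑞泽7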