A Few Things About Sogou


A walkthrough of a complete scraping script that exercises the Selenium WebDriver, json, re and BeautifulSoup APIs against Sogou web search: it reads a list of car models, queries Sogou for each one, parses the result pages under two layout rules, pages through the results with Selenium, and saves the records as JSON lines.

# -*- coding: utf-8 -*-
# Filename: 完整版搜狗脚本 (complete Sogou scraping script)
# Author:   Guan
# Datetime: 2018/12/6

# Imports
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from selenium import webdriver



# get_url: trivial helper that simply echoes the URL back (handy as a logging hook)
def get_url(url):
    # print(url)
    return url

# get_html: fetch a result page with requests and return the decoded HTML.
# Note that the Cookie below was captured from one specific browser session and will expire.
def get_html(url):
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie":"CXID=2C7D3DCAAA31333F0CA6B9F1D42B448E; SUID=07E1EB7C5B68860A5BEA43270009A690; ad=oujQFyllll2bf6GXlllllVs$yDolllllKnsPxZllllylllllRv7ll5@@@@@@@@@@; ABTEST=7|1543394206|v17; SUV=1543394206395727; browerV=3; osV=1; pgv_pvi=6458014720; SUIR=A2444FD8A4A1D8F49FA7A706A50AC1B8; IPLOC=CN1100; sct=205; PHPSESSID=0vtaku6mnueca0of31id4vf315; Hm_lvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309; Hm_lpvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309; taspeed=taspeedexist; sst0=606; ld=klllllllll2b@On0lllllVZJnG9lllllKnsPxZlllj6lllllxklll5@@@@@@@@@@; pgv_si=s6807446528; SNUID=F6299ACCF7F284AC18D7C7B8F8B242AD; seccodeRight=success; successCount=1|Thu, 06 Dec 2018 15:13:51 GMT"
    }
    response = requests.get(url=url, headers=headers).content.decode()
    return response
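
# --- Optional sketch, not part of the original script ---------------------------------
# get_html assumes every request succeeds, and the session Cookie above can expire.
# This is a minimal, hedged sketch of a more defensive fetch with a timeout, a status
# check and a few retries; the name get_html_safe and the retry count are assumptions
# made here for illustration only.
def get_html_safe(url, headers, retries=3):
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()          # raise on 4xx/5xx responses
            return response.content.decode()
        except requests.RequestException:
            time.sleep(2)                        # brief pause before retrying
    return ''                                    # give up after the last retry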

# get_cont: parse one result page and append the extracted records to a local file.
# The page URL is passed in explicitly instead of relying on a global variable.
def get_cont(html, url):

    soup = BeautifulSoup(html, 'lxml')

    # Rule 1: results rendered inside <div style="width:548px"> blocks
    regulation_one = soup.select('div[style="width:548px"]')
    cont_list = []
    for i in regulation_one:
        cont_dict = {}
        cont_dict['url'] = get_url(url)
        cont_dict['title'] = i.find_all('h3')[0].text.strip()
        cont_dict['cont'] = i.find_all('p')[0].text.strip()

        # Split the <cite> text into source and publish date
        pat = re.split(' - ', i.find_all('cite')[0].text.strip())
        cont_dict['source'] = pat[0]

        try:
            # The part after " - " holds "source-date"
            reg = re.split('-', pat[1], maxsplit=1)
            cont_dict['source'] = reg[0]
            cont_dict['pub_date'] = reg[1].strip()
        except Exception:
            # No " - " separator: source and date are both in the first chunk
            reg1 = re.split('-', pat[0], maxsplit=1)
            cont_dict['source'] = reg1[0]
            cont_dict['pub_date'] = reg1[1].strip()

        cont_list.append(cont_dict)
    # print(cont_list)



    # Rule 2: results rendered inside <div class="rb"> blocks
    regulation_two = soup.select('div[class="rb"]')
    cont_list2 = []
    for j in regulation_two:
        cont_dict2 = {}
        cont_dict2['url'] = get_url(url)
        cont_dict2['title'] = j.find_all('h3')[0].text.strip()
        cont_dict2['cont'] = j.select('div[class="ft"]')[0].text.strip()
        # Split the <cite> text into source and publish date
        regulation = re.split(' - ', j.find_all('cite')[0].text.strip())
        reg = re.split('-', regulation[0], maxsplit=1)
        cont_dict2['source'] = reg[0]

        try:
            # Strip the "翻译此页" ("translate this page") suffix from the date
            cont_dict2['pub_date'] = re.sub('翻译此页', '', reg[1]).strip()
        except Exception:
            # The date sits after the " - " separator instead
            reg1 = re.split('-', regulation[1], maxsplit=1)
            cont_dict2['source'] = reg1[0]
            cont_dict2['pub_date'] = re.sub('翻译此页', '', reg1[1]).strip()
        cont_list2.append(cont_dict2)

    # Append the parsed records to a local file, one JSON object per line
    with open(r'D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗本地数据\自动翻页数据存储', 'a', encoding='utf-8') as f:
        # Records matched by rule 1
        for regulation1 in cont_list:
            record = json.dumps(regulation1, ensure_ascii=False)
            print(record)
            f.write(record + '\n')
        # Records matched by rule 2
        for regulation2 in cont_list2:
            record2 = json.dumps(regulation2, ensure_ascii=False)
            print(record2)
            f.write(record2 + '\n')
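
# --- Optional sketch, not part of the original script ---------------------------------
# Each record is appended as one JSON object per line (JSON Lines), so the output file
# can be read back with json.loads line by line.  The name load_records is an assumption
# made here for illustration only.
def load_records(path):
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records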





# Script entry point
if __name__ == '__main__':
    # Example of scraping a single fixed query (kept for reference):
    # url = get_url('http://www.sogou.com/web?query=%E7%91%9E%E8%99%8E8&page=12&ie=utf8')
    # html = get_html(url)
    # get_cont(html, url)


    # Read the list of car models to query (one or more comma-separated names per line)
    with open(r'D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗配置车型', encoding='utf-8') as file:
        cont = file.readlines()
    chexing = []
    for i in cont:
        new_chexing = i.split(',')
        for j in new_chexing:
            chexing.append(j.strip())  # strip newlines/spaces so they do not leak into the query URL
    print(chexing)
    # Fetch page 1 for each model, then let Selenium click through the remaining pages
    for cx in chexing:

        # requests percent-encodes the Chinese query automatically (see the note after the script)
        url = get_url('http://www.sogou.com/web?query=%s&page=1&ie=utf8' % cx)
        print('正在获取%s车型的数据' % cx)
        html = get_html(url)
        get_cont(html, url)

        # Drive a real browser to follow the "next page" link, re-fetching each page with requests
        driver = webdriver.Chrome()
        driver.get(url)

        try:
            while True:
                time.sleep(3)
                driver.find_element_by_xpath('//*[@id="sogou_next"]').click()
                url = driver.current_url
                html = get_html(url)
                get_cont(html, url)
        except Exception:
            # The "next page" link is gone, i.e. the last result page has been reached
            print('已到最后一页')
        driver.quit()
    print('数据爬取完成')
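
A note on the query URL: the commented-out example in __main__ already carries the percent-encoded form of the query, since 瑞虎8 becomes %E7%91%9E%E8%99%8E8 under UTF-8 percent-encoding. requests performs this encoding automatically when cx is interpolated into the query string, but the same result can be produced explicitly with the standard library. A small illustrative snippet, separate from the script above:

from urllib.parse import quote

print(quote('瑞虎8'))  # %E7%91%9E%E8%99%8E8
print('http://www.sogou.com/web?query=%s&page=1&ie=utf8' % quote('瑞虎8'))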


