Scraping Qichacha / Zhihu / the latest 5-level National Bureau of Statistics administrative divisions / Baidu Maps API coordinates / Google Play app reviews and ratings

1. Qichacha data scraping

1.1 LOGOs of key companies

# -*-coding:utf-8-*-

import pandas as pd
import requests
import json
import random
import time
from lxml import etree
import re

cookies = pd.read_csv('C:/Desktop/cookies.txt')
# cookies2 = pd.read_csv('C:/Desktop/cookies2.txt')


# cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
# 'User-Agent': random.choice(uas)


def get_companies(key):
    idx = random.randint(0, cookies.shape[0] - 1)
    acw_tc = cookies.iloc[idx, 0]
    QCCSESSID = cookies.iloc[idx, 1]
    cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}

    # proxy = {
    #     'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
    fnum = 0
    url = 'https://www.qcc.com/web/search?key={}'.format(key)
    try:
        # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
        response = requests.get(url, headers=headers)
        # response = requests.get(url, headers=self.headers)
        html = response.text
        parseHtml = etree.HTML(html)
        resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
        resNum = int(resNum[0])
        if resNum > 0:
            print(resNum)
            imgUrl = parseHtml.xpath(
                '//div[@class="search-cell"]//tr[1]/td[@class="imgtd"]/div[@class="img"]/img/@src')
            print(imgUrl)
            return imgUrl

    except Exception as e:
        fnum = fnum + 1
        names.append(key)
        print('Connect fail')
        time.sleep(random.randint(10, 30))
        if fnum % 2 == 0:
            time.sleep(random.randint(10, 30))


def save_img(imgUrl, key):
    response = requests.get(imgUrl)
    filename = key + '.jpg'
    with open(filename, 'wb') as f:
        f.write(response.content)
    return key + 'Done'


if __name__ == "__main__":
    names = pd.read_excel('E:/爬取集团logo.xlsx').loc[:,'集团名称'].unique().tolist()
    # names = names[:2]
    total = 0
    count = 0
    with open('E:/groupimgUrl.txt', 'a', encoding='utf8') as fp:
        for i in names:
            total += 1
            key = i
            data= dict()
            imgUrl = get_companies(key)
            if imgUrl:
                data['key'] = key
                data['imgUrl'] = imgUrl
                data = json.dumps(data)
                fp.write(data + '\n')
                count += 1
                print(count)
                if count % 100 == 0:
                    time.sleep(random.randint(30, 60))
            else:
                print('NotFund')
            print('total=%s' % total)

1.2 Fetching the equity penetration (shareholding) chart

import pandas as pd
import requests
import time
import random
from lxml import etree
import json

cookies = pd.read_csv('C:/Desktop/cookies.txt')
cookies2 = pd.read_csv('C:/Desktop/cookies2.txt')


# Get the company ID
def getCompanyId(key):
    idx = random.randint(0, cookies2.shape[0] - 1)
    acw_tc = cookies2.iloc[idx, 0]
    QCCSESSID = cookies2.iloc[idx, 1]
    cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}

    ip = pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]
    proxy = {'http': ip}
    fnum = 0
    url = 'https://www.qcc.com/web/search?key={}'.format(key)
    try:
        # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
        response = requests.get(url, headers=headers, proxies=proxy)
        # response = requests.get(url, headers=self.headers)
        html = response.text
        parseHtml = etree.HTML(html)
        resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
        resNum = int(resNum[0])
        if resNum > 0:
            id = parseHtml.xpath(
                '//div[@class="maininfo"]/a[@class="title"]/@href')
            return id[0]
        else:
            #names.append(key)
            pass

    except Exception as e:
        fnum = fnum + 1
        #names.append(key)
        print('Connect fail')
        time.sleep(random.randint(10, 30))
        if fnum % 2 == 0:
            time.sleep(random.randint(10, 30))


def getCompanyGuQuan(key):
    idx = random.randint(0, cookies.shape[0] - 1)
    acw_tc = cookies.iloc[idx, 0]
    QCCSESSID = cookies.iloc[idx, 1]
    cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}

    proxy = {
        'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
    fnum = 0
    url = "https://www.qcc.com/api/charts/getBeneficiaryData"
    postData = {'keyNo': key,"type": 2}
    try:
        res = requests.post(url=url, data=postData, headers=headers, proxies=proxy)
        if res.text:
            return res.text
        else:
            ids.append(key)
    except Exception as e:
        fnum = fnum + 1
        ids.append(key)
        print('Connect fail')
        time.sleep(random.randint(10, 30))
        if fnum % 2 == 0:
            time.sleep(random.randint(10, 30))


if __name__ == "__main__":
    # names = pd.read_excel('C:/Users/lele.ding/Desktop/test.xlsx').loc[:, 'keys'].unique().tolist()
    # ids = pd.read_excel('E:/companyIds.xlsx').loc[:,'id'].tolist()
    ids =['65765594ae6ec7793a279424e52c7dc2']
    # names = names[5222:]
    total = 0
    count = 0
    # with open('E:/companyIds.txt', 'a', encoding='utf8') as fp:
    #     for i in ids:
    #         total += 1
    #         key = i
    #         data = dict()
    #         id = getCompanyId(key)
    #         if id:
    #             data['key'] = key
    #             data['id'] = id
    #             data = json.dumps(data)
    #             fp.write(data + '\n')
    #             count += 1
    #             print(count)
    #             if count % 100 == 0:
    #                 time.sleep(random.randint(30, 60))
    #         else:
    #             print('NotFund')
    #         print('total=%s' % total)
    with open('E:/companyGuQuan.txt', 'a', encoding='utf8') as fp:
        for i in ids:
            total += 1
            key = i
            data = dict()
            id = getCompanyGuQuan(key)
            if id:
                data['key'] = key
                data['id'] = id
                data = json.dumps(data)
                fp.write(data + '\n')
                count += 1
                print(count)
                if count % 100 == 0:
                    time.sleep(random.randint(30, 60))
            else:
                print('NotFund')
            print('total=%s' % total)

1.3 Group-related data (VIP)

# -*-coding:utf-8-*-

import pandas as pd
import requests
import json
import random
import time
from lxml import etree

cookies = pd.read_csv('../cookies.txt')
# 'User-Agent': random.choice(uas)


def get_companies(key):
    idx = random.randint(0, cookies.shape[0]-1)
    acw_tc = cookies.iloc[idx, 0]
    QCCSESSID = cookies.iloc[idx, 1]
    cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}

    # proxy = {
    #     'http':pd.read_csv('E:/ip.txt',header=None,sep='\n',encoding='utf8').iloc[:,0].to_list()[-1]}
    fnum = 0
    url = 'https://www.qcc.com/web/search?key={}'.format(key)
    try:
        # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
        response = requests.get(url, headers=headers)
        # response = requests.get(url, headers=self.headers)
        html = response.text
        parseHtml = etree.HTML(html)
        resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
        resNum = int(resNum[0])
        if resNum > 1:
            comurl = parseHtml.xpath('//div[@class="maininfo"]/a[@class="title"]/@href')
            com = dict()
            com['comurl'] = comurl[0]
            return com
        else:
            com = dict()
            com['temp']= 'temp'
            return com
    except Exception as e:
        fnum = fnum + 1
        names.append(key)
        print('Connect fail')
        time.sleep(random.randint(10, 30))
        if fnum % 2 == 0:
            time.sleep(random.randint(10, 30))


def get_group(url):
    idx = random.randint(0, cookies.shape[0]-1)
    acw_tc = cookies.iloc[idx, 0]
    QCCSESSID = cookies.iloc[idx, 1]
    cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.502 Safari/537.36'}
    # proxy = {
    #     'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
    fnum = 0
    data = dict()
    try:
        # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
        response = requests.get(url, headers=headers)
        # response = requests.get(url, headers=self.headers)
        html = response.text
        parseHtml = etree.HTML(html)
        temp = parseHtml.xpath(
            '//a[@class="oxin2"]//text()')
        print(temp)
        if temp:
            if temp[0] == "所属集团:" or temp[0] == "企业族群:":
                groupUrl = parseHtml.xpath(
                    '//a[@class="oxin2"][2]/@href')
                data['groupUrl'] = groupUrl
                data['label'] = 'Success'
        else:
            data['label'] = 'Nothing'
        return data
    except Exception as e:
        fnum = fnum + 1
        names.append(key)
        print('Connect fail')
        time.sleep(random.randint(10, 30))
        if fnum % 2 == 0:
            time.sleep(random.randint(10, 30))


if __name__ == "__main__":
    names = pd.read_excel('E:/典范雇主工商信息.xls').loc[:, '企业名称'].unique().tolist()
    # names = names[12348:]
    total = 0
    fnum = 0
    count = 0
    with open('E:/dianfan_groupName.txt', 'a', encoding='utf8') as fp:
        for i in names:
            total += 1
            key = i
            com = get_companies(key)
            if com:
                if com.get('comurl'):
                    data = get_group(com.get('comurl'))
                    if data:
                        data['key'] = key
                        data['comurl'] = com.get('comurl')
                        data = json.dumps(data)
                        fp.write(data + '\n')
                        count += 1
                        print(count)
                        if count % 100 == 0:
                            time.sleep(random.randint(30, 60))
                else:
                    print('NotFund')
            print('total=%s' % total)

2. BOSS Zhipin data scraping

2.1 Scraping the job-preference questionnaire from the app side

import pandas as pd
import requests
import datetime
import json
import numpy as np

code_df = pd.read_csv('./workFuncCodeMap.txt')   # BOSS's code for each job function
codes = code_df['code'].unique().tolist()

d = datetime.datetime.now()
d = "%s%s%s" % (d.year, d.month, d.day)


def get_perInfo():
    headers = {'cookie': '_bl_uid=a9k10mag3he18U7jhybai70rIC7v; lastCity=101010100; __g=-; '
                         'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1623835187; '
                         'wt2=D0Tcy2A_vz6-GkdeFUckL28Oa8gmlvcLmLMLn1bPrDV55Ic1yQwFY-4uU6Z9sIRcNldDJXsCLg4OaNLT6ZjznRw'
                         '~~; __l=l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Frecommend%3Frandom%3D1623835215696&r=&g=&s=3'
                         '&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1623835218; '
                         '__c=1623835187; __a=40920060.1623835187..1623835187.4.1.4.4; '
                         '__zp_stoken__=3638cKTB2PxgQK3wmdSxLGTEKbDZaVmo0CiUQM0p9Nw0SICRyW3p3YhhoJSJPVC0hfiBNSHcgLmUdZHVUG0FONE0LIDYCN1V8Uk46Jk0LNmJHYgMDEjFcXkABY1lvCidcTgNddV8gBhAYcjR0; geek_zp_token=V1R9MlGeD12VlgXdNrzBUbLi246TLVzQ~~',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/91.0.4472.77 Safari/537.36'}
    with open('./boss偏好_%s.txt' % d, 'a', encoding='utf8') as f:
        for k, i in enumerate(codes):
            url = "https://m.zhipin.com/wapi/zpgeek/trait/questions.json?position=%s&source=0" % i
           	"关键url"
            res = requests.get(url=url, headers=headers)
            s = i + '|' + res.text + '\n'
            f.write(s)
            print(k)


def get_detail(x):
    detail = pd.DataFrame()
    for i in x['questions']:
        te = pd.DataFrame(i.get('options'))
        te.rename(columns={'encryptId': 'options_encryptId'}, inplace=True)
        te['code'] = x['code']
        te['title'] = x['title']
        te['subTitle'] = x['subTitle']
        te['encryptId'] = i.get('encryptId')
        te['questionTitle'] = i.get('questionTitle')
        te['questionType'] = i.get('questionType')
        te['otherAnswer'] = i.get('otherAnswer')
        te['optionLimit'] = i.get('optionLimit')
        detail = pd.concat([detail, te])
    return detail


def clean_data():
    boss = pd.read_csv('./boss偏好_%s.txt' % d, header=None, sep='|')
    boss.columns = ['code', 'info']
    boss['info'] = boss['info'].apply(lambda x: json.loads(x))
    boss['zpData'] = boss['info'].apply(lambda x: x.get('zpData'))
    boss['questions'] = boss['zpData'].apply(lambda x: x.get('questions'))
    boss['title'] = boss['zpData'].apply(lambda x: x.get('title'))
    boss['subTitle'] = boss['zpData'].apply(lambda x: x.get('subTitle'))
    boss = boss.iloc[:, [0, 3, 4, 5]]
    boss = boss[boss['questions'].notna()]
    detail = boss.apply(get_detail, axis=1)
    bossPer = pd.concat(detail.values)
    bossPer = bossPer.iloc[:, :-2]
    bossPer = bossPer.loc[:, ['code', 'title', 'subTitle', 'encryptId', 'questionTitle', 'questionType',
                              'options_encryptId', 'jumpQuestionIds', 'content', 'chosen']]
    bossPer['code'] = bossPer['code'].map(str)
    bossPer = code_df.merge(bossPer, on='code')
    bossPer = bossPer.iloc[:, 1:]
    bossPer.drop(columns='code', inplace=True)
    bossPer.drop(columns='chosen', inplace=True)
    bossPer['jumpQuestionIds'] = bossPer[bossPer['jumpQuestionIds'].notna()]['jumpQuestionIds'].apply(
        lambda x: x.replace('[', '').replace(']', ''))
    bossPer['jumpQuestionIds'] = bossPer['jumpQuestionIds'].apply(lambda x: x if x else np.nan)
    bossPer.drop_duplicates(inplace=True)
    bossPer.to_excel('./boss偏好_%s.xlsx' % d, index=None)


if __name__ == "__main__":
    get_perInfo()
    clean_data()

2.2 Scraping the job-function category data from the left panel of the web site

This one is straightforward: no login is required, a plain requests call is enough. A minimal sketch follows.
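
The JSON endpoint below is an assumption taken from the browser's network panel at the time of writing, so confirm the current URL yourself before relying on it; the rest is standard requests usage.

import json
import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/91.0.4472.77 Safari/537.36'}

# Assumed endpoint that returns the whole job-category tree as JSON (no login needed);
# check the BOSS web page's network panel for the real URL.
url = 'https://www.zhipin.com/wapi/zpCommon/data/position.json'

res = requests.get(url, headers=headers)
tree = res.json()

# Persist the raw tree for later cleaning.
with open('./boss_position_tree.json', 'w', encoding='utf8') as f:
    json.dump(tree, f, ensure_ascii=False)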

3. Zhihu data scraping

The tricky part of scraping Zhihu is reversing the JS encryption to obtain the key intermediate id.

The concatenated string is first hashed with MD5 and then run through the JS encryption to produce the key intermediate id, which is then used when requesting the URL / data you want.

The process is illustrated below using the scraping of university majors and the courses they offer.

import json
import hashlib
from urllib import parse
import requests
import pandas as pd

college_ = pd.read_excel('E:/知乎/专业.xlsx')

def get_fmd5():
    fmds = pd.DataFrame()
    for kw in college_['专业名称'].to_list():
        data1 = parse.urlencode({'': kw})   # yields '=<url-encoded keyword>'
        url = '/api/v4/search_v3?t=general&q' + data1 + '&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0'
        use_url = 'https://www.zhihu.com' + url
        d_c = '"ADAbkyZHcBGPTtV_mkiBdB_YUceORLSGQFI=|1592372620"'   # the d_c0 value from the cookies
        to_encrypt = '101_3_2.0+{}+{}'.format(url, d_c)
        fmd5 = hashlib.md5(to_encrypt.encode()).hexdigest()
        res = pd.DataFrame([kw, fmd5, use_url]).T
        fmds = pd.concat([fmds, res])
    fmds.to_csv('../mastor',index=None)
    with open('../mastor.json', 'a') as f:
        f.write(json.dumps(fmds.iloc[:, 1].to_list()))

For some reason my environment could not run the JS script, so I passed the MD5-hashed strings to a front-end colleague, who ran the JS encryption script to produce the final encrypted string, i.e. the id. The JS encryption script is as follows:

// const jsdom = require("jsdom");
// const { JSDOM } = jsdom;
// const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
// window = dom.window;
// document = window.document;
// XMLHttpRequest = window.XMLHttpRequest;


var exports = {}

function t(e) {
    return (t = "function" == typeof Symbol && "symbol" == typeof Symbol.A ? function (e) {
        return typeof e
    } : function (e) {
        return e && "function" == typeof Symbol && e.constructor === Symbol && e !== Symbol.prototype ? "symbol" : typeof e
    })(e)
}

Object.defineProperty(exports, "__esModule", {value: !0});
var A = "2.0", __g = {};

function s() {
}

function i(e) {
    this.t = (2048 & e) >> 11, this.s = (1536 & e) >> 9, this.i = 511 & e, this.h = 511 & e
}

function h(e) {
    this.s = (3072 & e) >> 10, this.h = 1023 & e
}

function a(e) {
    this.a = (3072 & e) >> 10, this.c = (768 & e) >> 8, this.n = (192 & e) >> 6, this.t = 63 & e
}

function c(e) {
    this.s = e >> 10 & 3, this.i = 1023 & e
}

function n() {
}

function e(e) {
    this.a = (3072 & e) >> 10, this.c = (768 & e) >> 8, this.n = (192 & e) >> 6, this.t = 63 & e
}

function o(e) {
    this.h = (4095 & e) >> 2, this.t = 3 & e
}

function r(e) {
    this.s = e >> 10 & 3, this.i = e >> 2 & 255, this.t = 3 & e
}

s.prototype.e = function (e) {
    e.o = !1
}, i.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            e.r[this.s] = this.i;
            break;
        case 1:
            e.r[this.s] = e.k[this.h]
    }
}, h.prototype.e = function (e) {
    e.k[this.h] = e.r[this.s]
}, a.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            e.r[this.a] = e.r[this.c] + e.r[this.n];
            break;
        case 1:
            e.r[this.a] = e.r[this.c] - e.r[this.n];
            break;
        case 2:
            e.r[this.a] = e.r[this.c] * e.r[this.n];
            break;
        case 3:
            e.r[this.a] = e.r[this.c] / e.r[this.n];
            break;
        case 4:
            e.r[this.a] = e.r[this.c] % e.r[this.n];
            break;
        case 5:
            e.r[this.a] = e.r[this.c] == e.r[this.n];
            break;
        case 6:
            e.r[this.a] = e.r[this.c] >= e.r[this.n];
            break;
        case 7:
            e.r[this.a] = e.r[this.c] || e.r[this.n];
            break;
        case 8:
            e.r[this.a] = e.r[this.c] && e.r[this.n];
            break;
        case 9:
            e.r[this.a] = e.r[this.c] !== e.r[this.n];
            break;
        case 10:
            e.r[this.a] = t(e.r[this.c]);
            break;
        case 11:
            e.r[this.a] = e.r[this.c] in e.r[this.n];
            break;
        case 12:
            e.r[this.a] = e.r[this.c] > e.r[this.n];
            break;
        case 13:
            e.r[this.a] = -e.r[this.c];
            break;
        case 14:
            e.r[this.a] = e.r[this.c] < e.r[this.n];
            break;
        case 15:
            e.r[this.a] = e.r[this.c] & e.r[this.n];
            break;
        case 16:
            e.r[this.a] = e.r[this.c] ^ e.r[this.n];
            break;
        case 17:
            e.r[this.a] = e.r[this.c] << e.r[this.n];
            break;
        case 18:
            e.r[this.a] = e.r[this.c] >>> e.r[this.n];
            break;
        case 19:
            e.r[this.a] = e.r[this.c] | e.r[this.n];
            break;
        case 20:
            e.r[this.a] = !e.r[this.c]
    }
}, c.prototype.e = function (e) {
    e.Q.push(e.C), e.B.push(e.k), e.C = e.r[this.s], e.k = [];
    for (var t = 0; t < this.i; t++) e.k.unshift(e.f.pop());
    e.g.push(e.f), e.f = []
}, n.prototype.e = function (e) {
    e.C = e.Q.pop(), e.k = e.B.pop(), e.f = e.g.pop()
}, e.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            e.u = e.r[this.a] >= e.r[this.c];
            break;
        case 1:
            e.u = e.r[this.a] <= e.r[this.c];
            break;
        case 2:
            e.u = e.r[this.a] > e.r[this.c];
            break;
        case 3:
            e.u = e.r[this.a] < e.r[this.c];
            break;
        case 4:
            e.u = e.r[this.a] == e.r[this.c];
            break;
        case 5:
            e.u = e.r[this.a] != e.r[this.c];
            break;
        case 6:
            e.u = e.r[this.a];
            break;
        case 7:
            e.u = !e.r[this.a]
    }
}, o.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            e.C = this.h;
            break;
        case 1:
            e.u && (e.C = this.h);
            break;
        case 2:
            e.u || (e.C = this.h);
            break;
        case 3:
            e.C = this.h, e.w = null
    }
    e.u = !1
}, r.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            for (var t = [], n = 0; n < this.i; n++) t.unshift(e.f.pop());
            e.r[3] = e.r[this.s](t[0], t[1]);
            break;
        case 1:
            for (var r = e.f.pop(), o = [], i = 0; i < this.i; i++) o.unshift(e.f.pop());
            e.r[3] = e.r[this.s][r](o[0], o[1]);
            break;
        case 2:
            for (var a = [], c = 0; c < this.i; c++) a.unshift(e.f.pop());
            e.r[3] = new e.r[this.s](a[0], a[1])
    }
};
var k = function (e) {
    for (var t = 66, n = [], r = 0; r < e.length; r++) {
        var o = 24 ^ e.charCodeAt(r) ^ t;
        n.push(String.fromCharCode(o)), t = o
    }
    return n.join("")
};

function Q(e) {
    this.t = (4095 & e) >> 10, this.s = (1023 & e) >> 8, this.i = 1023 & e, this.h = 63 & e
}

function C(e) {
    this.t = (4095 & e) >> 10, this.a = (1023 & e) >> 8, this.c = (255 & e) >> 6
}

function B(e) {
    this.s = (3072 & e) >> 10, this.h = 1023 & e
}

function f(e) {
    this.h = 4095 & e
}

function g(e) {
    this.s = (3072 & e) >> 10
}

function u(e) {
    this.h = 4095 & e
}

function w(e) {
    this.t = (3840 & e) >> 8, this.s = (192 & e) >> 6, this.i = 63 & e
}

function G() {
    this.r = [0, 0, 0, 0], this.C = 0, this.Q = [], this.k = [], this.B = [], this.f = [], this.g = [], this.u = !1, this.G = [], this.b = [], this.o = !1, this.w = null, this.U = null, this.F = [], this.R = 0, this.J = {
        0: s,
        1: i,
        2: h,
        3: a,
        4: c,
        5: n,
        6: e,
        7: o,
        8: r,
        9: Q,
        10: C,
        11: B,
        12: f,
        13: g,
        14: u,
        15: w
    }
}

Q.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            e.f.push(e.r[this.s]);
            break;
        case 1:
            e.f.push(this.i);
            break;
        case 2:
            e.f.push(e.k[this.h]);
            break;
        case 3:
            e.f.push(k(e.b[this.h]))
    }
}, C.prototype.e = function (A) {
    switch (this.t) {
        case 0:
            var t = A.f.pop();
            A.r[this.a] = A.r[this.c][t];
            break;
        case 1:
            var s = A.f.pop(), i = A.f.pop();
            A.r[this.c][s] = i;
            break;
        case 2:
            var h = A.f.pop();
            A.r[this.a] = eval(h)
    }
}, B.prototype.e = function (e) {
    e.r[this.s] = k(e.b[this.h])
}, f.prototype.e = function (e) {
    e.w = this.h
}, g.prototype.e = function (e) {
    throw e.r[this.s]
}, u.prototype.e = function (e) {
    var t = this, n = [0];
    e.k.forEach(function (e) {
        n.push(e)
    });
    var r = function (r) {
        var o = new G;
        return o.k = n, o.k[0] = r, o.v(e.G, t.h, e.b, e.F), o.r[3]
    };
    r.toString = function () {
        return "() { [native code] }"
    }, e.r[3] = r
}, w.prototype.e = function (e) {
    switch (this.t) {
        case 0:
            for (var t = {}, n = 0; n < this.i; n++) {
                var r = e.f.pop();
                t[e.f.pop()] = r
            }
            e.r[this.s] = t;
            break;
        case 1:
            for (var o = [], i = 0; i < this.i; i++) o.unshift(e.f.pop());
            e.r[this.s] = o
    }
}, G.prototype.D = function (e) {
    console.log(window.atob(e));
    for (var t = window.atob(e), n = t.charCodeAt(0) << 8 | t.charCodeAt(1), r = [], o = 2; o < n + 2; o += 2) r.push(t.charCodeAt(o) << 8 | t.charCodeAt(o + 1));
    this.G = r;
    for (var i = [], a = n + 2; a < t.length;) {
        var c = t.charCodeAt(a) << 8 | t.charCodeAt(a + 1), s = t.slice(a + 2, a + 2 + c);
        i.push(s), a += c + 2
    }
    this.b = i
}, G.prototype.v = function (e, t, n) {
    for (t = t || 0, n = n || [], this.C = t, "string" == typeof e ? this.D(e) : (this.G = e, this.b = n), this.o = !0, this.R = Date.now(); this.o;) {
        var r = this.G[this.C++];
        if ("number" != typeof r) break;
        var o = Date.now();
        if (500 < o - this.R) return;
        this.R = o;
        try {
            this.e(r)
        } catch (e) {
            this.U = e, this.w && (this.C = this.w)
        }
    }
}, G.prototype.e = function (e) {
    var t = (61440 & e) >> 12;
    new this.J[t](e).e(this)
}, "undefined" != typeof window && (new G).v("AxjgB5MAnACoAJwBpAAAABAAIAKcAqgAMAq0AzRJZAZwUpwCqACQACACGAKcBKAAIAOcBagAIAQYAjAUGgKcBqFAuAc5hTSHZAZwqrAIGgA0QJEAJAAYAzAUGgOcCaFANRQ0R2QGcOKwChoANECRACQAsAuQABgDnAmgAJwMgAGcDYwFEAAzBmAGcSqwDhoANECRACQAGAKcD6AAGgKcEKFANEcYApwRoAAxB2AGcXKwEhoANECRACQAGAKcE6AAGgKcFKFANEdkBnGqsBUaADRAkQAkABgCnBagAGAGcdKwFxoANECRACQAGAKcGKAAYAZx+rAZGgA0QJEAJAAYA5waoABgBnIisBsaADRAkQAkABgCnBygABoCnB2hQDRHZAZyWrAeGgA0QJEAJAAYBJwfoAAwFGAGcoawIBoANECRACQAGAOQALAJkAAYBJwfgAlsBnK+sCEaADRAkQAkABgDkACwGpAAGAScH4AJbAZy9rAiGgA0QJEAJACwI5AAGAScH6AAkACcJKgAnCWgAJwmoACcJ4AFnA2MBRAAMw5gBnNasCgaADRAkQAkABgBEio0R5EAJAGwKSAFGACcKqAAEgM0RCQGGAYSATRFZAZzshgAtCs0QCQAGAYSAjRFZAZz1hgAtCw0QCQAEAAgB7AtIAgYAJwqoAASATRBJAkYCRIANEZkBnYqEAgaBxQBOYAoBxQEOYQ0giQKGAmQABgAnC6ABRgBGgo0UhD/MQ8zECALEAgaBxQBOYAoBxQEOYQ0gpEAJAoYARoKNFIQ/zEPkAAgChgLGgkUATmBkgAaAJwuhAUaCjdQFAg5kTSTJAsQCBoHFAE5gCgHFAQ5hDSCkQAkChgBGgo0UhD/MQ+QACAKGAsaCRQCOYGSABoAnC6EBRoKN1AUEDmRNJMkCxgFGgsUPzmPkgAaCJwvhAU0wCQFGAUaCxQGOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQMOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQSOZISPzZPkQAaCJwvhAU0wCQFGAkSAzRBJAlz/B4FUAAAAwUYIAAIBSITFQkTERwABi0GHxITAAAJLwMSGRsXHxMZAAk0Fw8HFh4NAwUABhU1EBceDwAENBcUEAAGNBkTGRcBAAFKAAkvHg4PKz4aEwIAAUsACDIVHB0QEQ4YAAsuAzs7AAoPKToKDgAHMx8SGQUvMQABSAALORoVGCQgERcCAxoACAU3ABEXAgMaAAsFGDcAERcCAxoUCgABSQAGOA8LGBsPAAYYLwsYGw8AAU4ABD8QHAUAAU8ABSkbCQ4BAAFMAAktCh8eDgMHCw8AAU0ADT4TGjQsGQMaFA0FHhkAFz4TGjQsGQMaFA0FHhk1NBkCHgUbGBEPAAFCABg9GgkjIAEmOgUHDQ8eFSU5DggJAwEcAwUAAUMAAUAAAUEADQEtFw0FBwtdWxQTGSAACBwrAxUPBR4ZAAkqGgUDAwMVEQ0ACC4DJD8eAx8RAAQ5GhUYAAFGAAAABjYRExELBAACWhgAAVoAQAg/PTw0NxcQPCQ5C3JZEBs9fkcnDRcUAXZia0Q4EhQgXHojMBY3MWVCNT0uDhMXcGQ7AUFPHigkQUwQFkhaAkEACjkTEQspNBMZPC0ABjkTEQsrLQ==");
var abc = function (e) {
    console.log(__g._encrypt(encodeURIComponent(e)))
    // return __g._encrypt(encodeURIComponent(e))
};

// function run(e) {
//     return __g._encrypt(encodeURIComponent(e))
// }

// module.exports = {run}
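
If you do have a JavaScript runtime available, one possible route (not what was done here) is to drive the script from Python with PyExecJS. The sketch below assumes the commented-out run function above is restored (so the script returns the value instead of printing it) and that a window/atob shim such as jsdom is wired in as hinted in the commented-out header, since the initializer only runs when window is defined; the file name is hypothetical.

# Sketch only: requires `pip install PyExecJS`, a Node.js runtime, the restored
# `run` function, and a window/atob shim (e.g. jsdom) so that __g gets initialized.
import execjs

with open('./zhihu_encrypt.js', encoding='utf8') as f:   # hypothetical file name
    ctx = execjs.compile(f.read())


def get_x_zse_96(fmd5):
    # Turn the MD5 hex digest into the signature used in the x-zse-96 header.
    return '2.0_' + ctx.call('run', fmd5)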

With the id in hand, you can happily fetch whatever information you need.

# Example for 计算机科学与技术 (Computer Science and Technology): concatenate the required info,
# MD5-hash it, and call the JS to get the id; below is the test value
test = 'a0F8o7L8r8Yf6RY88RY0eAHqFBFYeLtyKXO02Ae0F9SX'
use_url = 'https://www.zhihu.com/api/v4/search_v3?t=general&q=计算机科学与技术&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0'
headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'cookie': 'd_c0=%s;' % (d_c),
        # 'referer': referer,
        'x-zse-93': '101_3_2.0',
        'x-zse-96': '2.0_%s' % test,  # produced by the JS encryption
    }

res = requests.get(use_url, headers=headers)

That said, Zhihu no longer provides the original data on universities, which institutions offer each major, employment prospects and the like...

4. Wutongguo (梧桐果) data scraping

For the Scrapy approach of chaining multi-level URL requests, see section 7 below.

5. Google Play app review scraping

Software: Seeyou (西柚)

Main module: requests under Python 3.6

After the page source is fetched with requests, the main difficulty lies in extracting the information.

There are two ways to extract the information:

  • Convert the text to HTML and extract with XPath (this does not work when the source rendered in the page differs from the source you actually receive, which is the case here).
  • Extract with regular expressions (a bit harder than XPath).

In a Google Play app page, the user ratings and review texts sit in the data of the AF_initDataCallback block whose key is 'ds:25' (the ds index differs from app to app, so look for the pattern). Once the target fields are located, pull them out with a regex.

import re
import demjson
import pandas as pd

test = open('../google_facebook.txt', encoding='utf8').read()  # the requests result for the Facebook app
p = re.compile(r"AF_initDataCallback\({key: \'ds:25.+?, data:.+?, sideChannel: {}}\)",re.DOTALL)
tt = p.findall(test)
tt_dict = demjson.decode(tt[0].replace('AF_initDataCallback(', '').rstrip(')'))
test_res=[]
for i, v in enumerate(tt_dict["data"][0]):
    d = dict()
    d['name'] = v[1][0]
    d['star'] = v[2]
    d['contents'] = v[4]
    test_res.append(d)
    
    
pd.DataFrame(test_res)

6. Getting latitude/longitude from the Baidu Maps API with Python

  1. Get the API key (ak):
    1. Register as a developer.
    2. Create an application (browser type; set the allowed access IPs to *). The detailed steps are shown in the screenshots.

  2. Use the requests module to look up the latitude/longitude of an input address, as in the code below.
import requests
import json


def getGeo(p):
    url = 'https://api.map.baidu.com/geocoding/v3/'
    params = {'address': p,
              'ak': '2gDvpo219bUG91R8VXGK5rhXYg7GKg8o',  # the ak of the application you created
              'output': 'json'}  # return the result as JSON
    res = requests.get(url, params)
    jd = json.loads(res.text)
    return jd
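
A quick usage example; the response layout below follows the v3 geocoding API (status 0 means success and result.location holds lng/lat), so adjust if the API changes.

jd = getGeo('北京市海淀区上地十街10号')      # any address string
if jd.get('status') == 0:
    loc = jd['result']['location']
    print(loc['lng'], loc['lat'])            # longitude, latitude
else:
    print('Geocoding failed:', jd)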

7. Using Scrapy to fetch the National Bureau of Statistics 5-level administrative divisions

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/

Data for 2020: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html

7.1 Create a project

scrapy startproject getGeoInfoFromStatsGov

cd getGeoInfoFromStatsGov

scrapy genspider geoInfo stats.gov.cn
  • -t: specifies which template to use; the default is "basic"

7.2 Adjust the configuration

7.2.1 pipelines

When first generated, the pipeline contains only the process_item function, which receives the items (the data returned or yielded by the spider).

To write to a file while avoiding repeatedly opening and closing it, we add a file-opening function above process_item and a file-closing function below it.

class GetgeoinfofromstatsgovPipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.fp = open('../geoInfo.txt', 'a', encoding='utf8')

    def process_item(self, item, spider):
        self.fp.write(str(item))      # write each item to the file
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.fp.close()

7.2.2 items

Define a Field instance for each of your variables.

Every variable you yield or return must be defined here; a sketch for this project follows.
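
A minimal items.py for this project, with one Field per value yielded by the spider in 7.3:

# getGeoInfoFromStatsGov/items.py
import scrapy


class GetgeoinfofromstatsgovItem(scrapy.Item):
    # one Field per value yielded by the spider
    province = scrapy.Field()
    cityCode = scrapy.Field()
    cityName = scrapy.Field()
    countyCode = scrapy.Field()
    countyName = scrapy.Field()
    townCode = scrapy.Field()
    townName = scrapy.Field()
    vCode = scrapy.Field()
    vName = scrapy.Field()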

7.2.3 settings

This is where the usual settings live:

  • Whether to obey robots.txt (if you obeyed it, there would be nothing left to crawl).
  • Uncommenting the pipelines (you can also write several pipelines, as long as each is registered in settings with a weight; the default is 300, and the actual number only matters relative to the others).
  • The log level printed to the console (optional, but without it the console gets very noisy, including DEBUG- and WARNING-level messages).
  • Middleware registration.
  • Database-related settings.
  • Distributed-crawling settings.

Here I only changed the log level/output and uncommented the pipelines, roughly as in the sketch below.
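
A minimal sketch of the relevant lines in settings.py (the setting names are standard Scrapy; the log file name is illustrative):

ROBOTSTXT_OBEY = False          # do not obey robots.txt

LOG_LEVEL = 'ERROR'             # keep the console quiet
# LOG_FILE = 'geoInfo.log'      # optionally redirect the log to a file

ITEM_PIPELINES = {
    'getGeoInfoFromStatsGov.pipelines.GetgeoinfofromstatsgovPipeline': 300,
}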

7.2.4 middlewares

Writing your own middleware requires understanding the internals of the request cycle first; that is fairly advanced, and I have rarely written one.

To write a middleware:

  • First uncomment the middleware section in settings and register your own middleware there (following the existing format). If I remember correctly, the smaller the number, the earlier it runs.
  • An example middleware that adds a random User-Agent, together with the matching change in settings, is sketched below.
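
A sketch of such a middleware (the class name and UA list are illustrative), followed by the matching registration in settings.py:

# middlewares.py
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
]


class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # called for every outgoing request; swap in a random User-Agent
        request.headers['User-Agent'] = random.choice(USER_AGENTS)


# settings.py
DOWNLOADER_MIDDLEWARES = {
    'getGeoInfoFromStatsGov.middlewares.RandomUserAgentMiddleware': 400,
}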

7.2.5 Writing scraped data to a database

I have not stored data this way myself; the write-ups online make it look fairly simple, so search for it if you need it.

7.3 Write the spider code

geoInfo.py is your main spider: it is the file generated by scrapy genspider geoInfo. Note that the spider name must not be the same as the project name.

7.3.1 Crawling the first level of URLs

The freshly generated geoInfo.py contains only a single parse function.

That function extracts each province name from the NBS index page and passes it to the pipelines.

Clearly, every province links to a page with its next level of information (the city level), each city page links to the county level, and so on. This calls for multi-level URL crawling.

7.3.2 Multi-level URL crawling

Each yield sets callback to the next parse function, which parses the next level, while the current level's information is passed along with the request.

This is done with scrapy.Request().

Parameters:

  • url: the URL the next-level function should request
  • callback: the name of the next-level parse function to invoke
  • meta: a dict used to pass information down to that parse function

The full code is as follows.

import scrapy
from ..items import GetgeoinfofromstatsgovItem


class GeoinfoSpider(scrapy.Spider):
    name = 'geoInfo'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/']  # http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html

    def parse(self, response):
        firstLevels = response.xpath('//tr[@class="provincetr"]//td//a')
        for i in firstLevels:
            province = i.xpath('./text()').extract_first()
            p_url = i.xpath('./@href').extract_first()
            p_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + p_url
            # if province in ['新疆维吾尔自治区', '云南省', '安徽省', '黑龙江省', '山西省', '河北省', '宁夏回族自治区', '四川省', '广东省', '内蒙古自治区', '上海市',
            #                 '湖北省', '青海省', '广西壮族自治区', '辽宁省', '江苏省', '天津市', '北京市','甘肃省', '江西省', '福建省', '陕西省', '湖南省', '吉林省', '山东省','西藏自治区', '贵州省', '重庆市']:
            #     continue
            yield scrapy.Request(url= p_url, callback=self.secondParse,
                                 meta={'info': (province, p_url)})

    def secondParse(self, response):
        province, p_url = response.meta.get('info')
        print(2, province, p_url)
        cityLevels = response.xpath('//table[@class="citytable"]//tr[@class="citytr"]')
        for c in cityLevels:
            cityCode = c.xpath('./td[1]//text()').extract_first()
            cityName = c.xpath('./td[2]//text()').extract_first()
            cityUrl = c.xpath('./td[2]//@href').extract_first()
            c_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl
            yield scrapy.Request(url=c_url, callback=self.thirdParse,
                                 meta={'info': (province, cityCode, cityName, cityUrl)})

    def thirdParse(self, response):
        province, cityCode, cityName, cityUrl = response.meta.get('info')
        cityUrl = cityUrl.strip().split('/')[0]
        countyLevels = response.xpath('//table[@class="countytable"]//tr[@class="countytr"]')
        for co in countyLevels:
            countyCode = co.xpath('./td[1]//text()').extract_first()
            countyName = co.xpath('./td[2]//text()').extract_first()
            countyUrl = co.xpath('./td[2]//@href').extract_first()
            if countyUrl:
                co_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl + '/' + countyUrl
                yield scrapy.Request(url=co_url, callback=self.fourParse,
                                     meta={'info': (
                                         province, cityCode, cityName, countyCode, countyName, cityUrl, countyUrl)})
            else:
                geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
                                                 countyCode=countyCode, countyName=countyName)
                yield geo

    def fourParse(self, response):
        province, cityCode, cityName, countyCode, countyName, cityUrl, countyUrl = response.meta.get('info')
        countyUrl = countyUrl.strip().split('/')[0]
        townLevels = response.xpath('//table[@class="towntable"]//tr[@class="towntr"]')
        for t in townLevels:
            townCode = t.xpath('./td[1]//text()').extract_first()
            townName = t.xpath('./td[2]//text()').extract_first()
            townUrl = t.xpath('./td[2]//@href').extract_first()
            if townUrl:
                t_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl + '/' + countyUrl + '/' + townUrl
                yield scrapy.Request(url=t_url, callback=self.fiveParse,
                                     meta={'info': (
                                         province, cityCode, cityName, countyCode, countyName, townCode, townName)})
            else:
                geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
                                                 countyCode=countyCode, countyName=countyName, townCode=townCode,
                                                 townName=townName)
                yield geo

    def fiveParse(self, response):
        province, cityCode, cityName, countyCode, countyName, townCode, townName = response.meta.get('info')
        villageLevels = response.xpath('//table[@class="villagetable"]//tr[@class="villagetr"]')
        for v in villageLevels:
            vCode = v.xpath('./td[1]/text()').extract_first()
            vName = v.xpath('./td[3]/text()').extract_first()
            geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
                                             countyCode=countyCode, countyName=countyName, townCode=townCode,
                                             townName=townName, vCode=vCode, vName=vName)
            yield geo

7.4 Data cleaning

import demjson
import pandas as pd

with open('E:/programDll/geoInfoNew.txt', 'a', encoding='utf8') as fp:
    with open('E:/programDll/geoInfo.txt', encoding='utf8') as f:
        for l in f.readlines():
            l = l.strip()
            if len(l.split('}')) == 2:
                l1 = l.split('}')[0] + "}" + "\n"
                fp.write(l1)
                if l.split('}')[1]:
                    l2 = l.split('}')[1]
                    fp.write(l2)
                else:
                    pass
            elif l.split('{')[0] == "":
                fp.write(l)
            else:
                fp.write(l)

geoInfo = pd.read_csv('E:/programDll/geoInfoNew.txt', header=None, sep='\n')
geoInfo.columns = ['info']
# geoInfo['info'] = geoInfo['info'].apply(lambda x: x.replace(',{', '{').replace(",,", ","))
geoInfo['info'] = geoInfo['info'].apply(lambda x: demjson.decode(x))
geoInfo['vCode'] = geoInfo['info'].apply(lambda x: x.get('vCode'))
geoInfo['vName'] = geoInfo['info'].apply(lambda x: x.get('vName'))
geoInfo['townCode'] = geoInfo['info'].apply(lambda x: x.get('townCode'))
geoInfo['townName'] = geoInfo['info'].apply(lambda x: x.get('townName'))
geoInfo['countyCode'] = geoInfo['info'].apply(lambda x: x.get('countyCode'))
geoInfo['countyName'] = geoInfo['info'].apply(lambda x: x.get('countyName'))
geoInfo['cityCode'] = geoInfo['info'].apply(lambda x: x.get('cityCode'))
geoInfo['cityName'] = geoInfo['info'].apply(lambda x: x.get('cityName'))
geoInfo['province'] = geoInfo['info'].apply(lambda x: x.get('province'))

geoInfo = geoInfo.iloc[:, 1:]
geoInfo.drop_duplicates(inplace=True)
geoInfo.to_csv('E:/programDll/getGeoInfoFromStatsGov/2020年五级地理位置划分.txt', index=None)

Sample data (screenshot omitted).
