1. Qichacha (企查查) data scraping
1.1 Logos of key companies
# -*-coding:utf-8-*-
import pandas as pd
import requests
import json
import random
import time
from lxml import etree
import re
cookies = pd.read_csv('C:/Desktop/cookies.txt')
# cookies2 = pd.read_csv('C:/Desktop/cookies2.txt')
# cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
# 'User-Agent': random.choice(uas)
def get_companies(key):
idx = random.randint(0, cookies.shape[0] - 1)
acw_tc = cookies.iloc[idx, 0]
QCCSESSID = cookies.iloc[idx, 1]
cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
headers = {
'Cookie': cookie,
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}
# proxy = {
# 'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
fnum = 0
url = 'https://www.qcc.com/web/search?key={}'.format(key)
try:
# with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
response = requests.get(url, headers=headers)
# response = requests.get(url, headers=self.headers)
html = response.text
parseHtml = etree.HTML(html)
resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
resNum = int(resNum[0])
if resNum > 0:
print(resNum)
imgUrl = parseHtml.xpath(
'//div[@class="search-cell"]//tr[1]/td[@class="imgtd"]/div[@class="img"]/img/@src')
print(imgUrl)
return imgUrl
except Exception as e:
fnum = fnum + 1
names.append(key)
print('Connect fail')
time.sleep(random.randint(10, 30))
if fnum % 2 == 0:
time.sleep(random.randint(10, 30))
def save_img(imgUrl, key):
response = requests.get(imgUrl)
filename = key + '.jpg'
with open(filename, 'wb') as f:
f.write(response.content)
return key + 'Done'
if __name__ == "__main__":
names = pd.read_excel('E:/爬取集团logo.xlsx').loc[:,'集团名称'].unique().tolist()
# names = names[:2]
total = 0
count = 0
with open('E:/groupimgUrl.txt', 'a', encoding='utf8') as fp:
for i in names:
total += 1
key = i
data= dict()
imgUrl = get_companies(key)
if imgUrl:
data['key'] = key
data['imgUrl'] = imgUrl
data = json.dumps(data)
fp.write(data + '\n')
count += 1
print(count)
if count % 100 == 0:
time.sleep(random.randint(30, 60))
else:
print('NotFund')
print('total=%s' % total)
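save_img is defined above but never called in __main__; a minimal, hedged hook-up could look like this (imgUrl comes back from the XPath as a list, and the scheme handling is an assumption about how the src values look):
# inside the main loop, right after imgUrl = get_companies(key):
if imgUrl:
    src = imgUrl[0]
    if src.startswith('//'):   # add a scheme if the src is protocol-relative
        src = 'https:' + src
    print(save_img(src, key))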
1.2 Fetching the equity penetration (beneficiary) data
import pandas as pd
import requests
import time
import random
from lxml import etree
import json
cookies = pd.read_csv('C:/Desktop/cookies.txt')
cookies2 = pd.read_csv('C:/Desktop/cookies2.txt')
# Get the company ID from the search results
def getCompanyId(key):
idx = random.randint(0, cookies2.shape[0] - 1)
acw_tc = cookies2.iloc[idx, 0]
QCCSESSID = cookies2.iloc[idx, 1]
cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
headers = {
'Cookie': cookie,
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}
ip = pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]
proxy = {
'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
fnum = 0
url = 'https://www.qcc.com/web/search?key={}'.format(key)
try:
# with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
response = requests.get(url, headers=headers, proxies=proxy)
# response = requests.get(url, headers=self.headers)
html = response.text
parseHtml = etree.HTML(html)
resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
resNum = int(resNum[0])
if resNum > 0:
id = parseHtml.xpath(
'//div[@class="maininfo"]/a[@class="title"]/@href')
return id[0]
else:
#names.append(key)
pass
except Exception as e:
fnum = fnum + 1
#names.append(key)
print('Connect fail')
time.sleep(random.randint(10, 30))
if fnum % 2 == 0:
time.sleep(random.randint(10, 30))
def getCompanyGuQuan(key):
idx = random.randint(0, cookies.shape[0] - 1)
acw_tc = cookies.iloc[idx, 0]
QCCSESSID = cookies.iloc[idx, 1]
cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
headers = {
'Cookie': cookie,
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}
proxy = {
'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
fnum = 0
url = "https://www.qcc.com/api/charts/getBeneficiaryData"
postData = {'keyNo': key,"type": 2}
try:
res = requests.post(url=url, data=postData, headers=headers, proxies=proxy)
if res.text:
return res.text
else:
            ids.append(key)  # re-queue this keyNo so the main loop retries it
except Exception as e:
fnum = fnum + 1
        ids.append(key)  # re-queue this keyNo so the main loop retries it
print('Connect fail')
time.sleep(random.randint(10, 30))
if fnum % 2 == 0:
time.sleep(random.randint(10, 30))
if __name__ == "__main__":
# names = pd.read_excel('C:/Users/lele.ding/Desktop/test.xlsx').loc[:, 'keys'].unique().tolist()
# ids = pd.read_excel('E:/companyIds.xlsx').loc[:,'id'].tolist()
ids =['65765594ae6ec7793a279424e52c7dc2']
# names = names[5222:]
total = 0
count = 0
# with open('E:/companyIds.txt', 'a', encoding='utf8') as fp:
# for i in ids:
# total += 1
# key = i
# data = dict()
# id = getCompanyId(key)
# if id:
# data['key'] = key
# data['id'] = id
# data = json.dumps(data)
# fp.write(data + '\n')
# count += 1
# print(count)
# if count % 100 == 0:
# time.sleep(random.randint(30, 60))
# else:
# print('NotFund')
# print('total=%s' % total)
with open('E:/companyGuQuan.txt', 'a', encoding='utf8') as fp:
for i in ids:
total += 1
key = i
data = dict()
id = getCompanyGuQuan(key)
if id:
data['key'] = key
data['id'] = id
data = json.dumps(data)
fp.write(data + '\n')
count += 1
print(count)
if count % 100 == 0:
time.sleep(random.randint(30, 60))
else:
print('NotFund')
print('total=%s' % total)
1.3 VIP group-related data
# -*-coding:utf-8-*-
import pandas as pd
import requests
import json
import random
import time
from lxml import etree
cookies = pd.read_csv('../cookies.txt')
# 'User-Agent': random.choice(uas)
def get_companies(key):
idx = random.randint(0, cookies.shape[0]-1)
acw_tc = cookies.iloc[idx, 0]
QCCSESSID = cookies.iloc[idx, 1]
cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
headers = {
'Cookie': cookie,
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'}
# proxy = {
# 'http':pd.read_csv('E:/ip.txt',header=None,sep='\n',encoding='utf8').iloc[:,0].to_list()[-1]}
fnum = 0
url = 'https://www.qcc.com/web/search?key={}'.format(key)
try:
# with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
response = requests.get(url, headers=headers)
# response = requests.get(url, headers=self.headers)
html = response.text
parseHtml = etree.HTML(html)
resNum = parseHtml.xpath('//div[@class="npanel-heading"]/h4/span[@class="text-danger"]/text()')
resNum = int(resNum[0])
if resNum > 1:
comurl = parseHtml.xpath('//div[@class="maininfo"]/a[@class="title"]/@href')
com = dict()
com['comurl'] = comurl[0]
return com
else:
com = dict()
com['temp']= 'temp'
return com
except Exception as e:
fnum = fnum + 1
names.append(key)
print('Connect fail')
time.sleep(random.randint(10, 30))
if fnum % 2 == 0:
time.sleep(random.randint(10, 30))
def get_group(url):
idx = random.randint(0, cookies.shape[0]-1)
acw_tc = cookies.iloc[idx, 0]
QCCSESSID = cookies.iloc[idx, 1]
cookie = 'acw_tc={}; QCCSESSID={}'.format(acw_tc, QCCSESSID)
headers = {
'Cookie': cookie,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.502 Safari/537.36'}
# proxy = {
# 'http': pd.read_csv('E:/ip.txt', header=None, sep='\n', encoding='utf8').iloc[:, 0].to_list()[-1]}
fnum = 0
data = dict()
try:
# with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
response = requests.get(url, headers=headers)
# response = requests.get(url, headers=self.headers)
html = response.text
parseHtml = etree.HTML(html)
temp = parseHtml.xpath(
'//a[@class="oxin2"]//text()')
print(temp)
if temp:
if temp[0] == "所属集团:" or temp[0] == "企业族群:":
groupUrl = parseHtml.xpath(
'//a[@class="oxin2"][2]/@href')
data['groupUrl'] = groupUrl
data['label'] = 'Success'
else:
data['label'] = 'Nothing'
return data
except Exception as e:
fnum = fnum + 1
names.append(key)
print('Connect fail')
time.sleep(random.randint(10, 30))
if fnum % 2 == 0:
time.sleep(random.randint(10, 30))
if __name__ == "__main__":
names = pd.read_excel('E:/典范雇主工商信息.xls').loc[:, '企业名称'].unique().tolist()
# names = names[12348:]
total = 0
fnum = 0
count = 0
with open('E:/dianfan_groupName.txt', 'a', encoding='utf8') as fp:
for i in names:
total += 1
key = i
com = get_companies(key)
if com:
if com.get('comurl'):
data = get_group(com.get('comurl'))
if data:
data['key'] = key
data = json.dumps(data)
fp.write(data + '\n')
count += 1
print(count)
if count % 100 == 0:
time.sleep(random.randint(30, 60))
else:
print('NotFund')
print('total=%s' % total)
2. Boss Zhipin data scraping
2.1 Scraping the job-preference questions from the app
import pandas as pd
import requests
import datetime
import json
import numpy as np
code_df = pd.read_csv('./workFuncCodeMap.txt')  # the Boss code for each job function
codes = code_df['code'].unique().tolist()
d = datetime.datetime.now()
d = "%s%s%s" % (d.year, d.month, d.day)
def get_perInfo():
headers = {'cookie': '_bl_uid=a9k10mag3he18U7jhybai70rIC7v; lastCity=101010100; __g=-; '
'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1623835187; '
'wt2=D0Tcy2A_vz6-GkdeFUckL28Oa8gmlvcLmLMLn1bPrDV55Ic1yQwFY-4uU6Z9sIRcNldDJXsCLg4OaNLT6ZjznRw'
'~~; __l=l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Frecommend%3Frandom%3D1623835215696&r=&g=&s=3'
'&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1623835218; '
'__c=1623835187; __a=40920060.1623835187..1623835187.4.1.4.4; '
'__zp_stoken__=3638cKTB2PxgQK3wmdSxLGTEKbDZaVmo0CiUQM0p9Nw0SICRyW3p3YhhoJSJPVC0hfiBNSHcgLmUdZHVUG0FONE0LIDYCN1V8Uk46Jk0LNmJHYgMDEjFcXkABY1lvCidcTgNddV8gBhAYcjR0; geek_zp_token=V1R9MlGeD12VlgXdNrzBUbLi246TLVzQ~~',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.77 Safari/537.36'}
with open('./boss偏好_%s.txt' % d, 'a', encoding='utf8') as f:
for k, i in enumerate(codes):
url = "https://m.zhipin.com/wapi/zpgeek/trait/questions.json?position=%s&source=0" % i
"关键url"
res = requests.get(url=url, headers=headers)
s = i + '|' + res.text + '\n'
f.write(s)
print(k)
def get_detail(x):
detail = pd.DataFrame()
for i in x['questions']:
te = pd.DataFrame(i.get('options'))
te.rename(columns={'encryptId': 'options_encryptId'}, inplace=True)
te['code'] = x['code']
te['title'] = x['title']
te['subTitle'] = x['subTitle']
te['encryptId'] = i.get('encryptId')
te['questionTitle'] = i.get('questionTitle')
te['questionType'] = i.get('questionType')
te['otherAnswer'] = i.get('otherAnswer')
te['optionLimit'] = i.get('optionLimit')
detail = pd.concat([detail, te])
return detail
def clean_data():
boss = pd.read_csv('./boss偏好_%s.txt' % d, header=None, sep='|')
boss.columns = ['code', 'info']
boss['info'] = boss['info'].apply(lambda x: json.loads(x))
boss['zpData'] = boss['info'].apply(lambda x: x.get('zpData'))
boss['questions'] = boss['zpData'].apply(lambda x: x.get('questions'))
boss['title'] = boss['zpData'].apply(lambda x: x.get('title'))
boss['subTitle'] = boss['zpData'].apply(lambda x: x.get('subTitle'))
boss = boss.iloc[:, [0, 3, 4, 5]]
boss = boss[boss['questions'].notna()]
detail = boss.apply(get_detail, axis=1)
bossPer = pd.concat(detail.values)
bossPer = bossPer.iloc[:, :-2]
bossPer = bossPer.loc[:, ['code', 'title', 'subTitle', 'encryptId', 'questionTitle', 'questionType',
'options_encryptId', 'jumpQuestionIds', 'content', 'chosen']]
bossPer['code'] = bossPer['code'].map(str)
bossPer = code_df.merge(bossPer, on='code')
bossPer = bossPer.iloc[:, 1:]
bossPer.drop(columns='code', inplace=True)
bossPer.drop(columns='chosen', inplace=True)
bossPer['jumpQuestionIds'] = bossPer[bossPer['jumpQuestionIds'].notna()]['jumpQuestionIds'].apply(
lambda x: x.replace('[', '').replace(']', ''))
bossPer['jumpQuestionIds'] = bossPer['jumpQuestionIds'].apply(lambda x: x if x else np.nan)
bossPer.drop_duplicates(inplace=True)
bossPer.to_excel('./boss偏好_%s.xlsx' % d, index=None)
if __name__ == "__main__":
get_perInfo()
clean_data()
2.2 Scraping the job-function category data from the left panel of the web page
This one is straightforward: no login is required, a plain request with the requests module is enough; a sketch follows.
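The sketch below assumes a placeholder entry URL and an illustrative XPath; take the real endpoint and markup from the browser's developer tools:
import requests
from lxml import etree

# placeholder entry page; swap in the actual category page observed in the browser
url = 'https://www.zhipin.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
html = requests.get(url, headers=headers).text
parseHtml = etree.HTML(html)
# illustrative XPath; adjust it to the actual left-panel markup
categories = parseHtml.xpath('//div[contains(@class, "job-menu")]//a/text()')
print(categories)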
3. Zhihu data scraping
The hard part of scraping Zhihu is reversing the JS encryption to obtain the key intermediate id.
The concatenated string is first hashed with md5 and then run through the JS encryption to produce that id, which is then used to request the URL or whatever data you want.
The process is illustrated below with the crawl of university majors and the courses they offer.
import json
import hashlib
from urllib import parse
import requests
import pandas as pd
college_ = pd.read_excel('E:/知乎/专业.xlsx')
def get_fmd5():
fmds = pd.DataFrame()
for kw in college_['专业名称'].to_list():
data1 = {'': kw}
data1 = parse.urlencode(data1)
url = '/api/v4/search_v3?t=general&q' + data1 + '&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0'
use_url = 'https://www.zhihu.com' + url
d_c = '"ADAbkyZHcBGPTtV_mkiBdB_YUceORLSGQFI=|1592372620"' #cookies中的值
to_encrypt = '101_3_2.0+{}+{}'.format(url, d_c)
baseurl = 'https://www.zhihu.com'
fmd5 = hashlib.md5(to_encrypt.encode()).hexdigest()
use_url = baseurl + url
res = pd.DataFrame([kw, fmd5, use_url]).T
fmds = pd.concat([fmds, res])
fmds.to_csv('../mastor',index=None)
with open('../mastor.json', 'a') as f:
f.write(json.dumps(fmds.iloc[:, 1].to_list()))
Because for some reason my environment could not run the JS script, I handed the md5-hashed strings to a frontend colleague, who ran the JS encryption script to get the final encrypted string, i.e. the id. The JS encryption script is as follows:
// const jsdom = require("jsdom");
// const { JSDOM } = jsdom;
// const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
// window = dom.window;
// document = window.document;
// XMLHttpRequest = window.XMLHttpRequest;
var exports = {}
function t(e) {
return (t = "function" == typeof Symbol && "symbol" == typeof Symbol.A ? function (e) {
return typeof e
} : function (e) {
return e && "function" == typeof Symbol && e.constructor === Symbol && e !== Symbol.prototype ? "symbol" : typeof e
})(e)
}
Object.defineProperty(exports, "__esModule", {value: !0});
var A = "2.0", __g = {};
function s() {
}
function i(e) {
this.t = (2048 & e) >> 11, this.s = (1536 & e) >> 9, this.i = 511 & e, this.h = 511 & e
}
function h(e) {
this.s = (3072 & e) >> 10, this.h = 1023 & e
}
function a(e) {
this.a = (3072 & e) >> 10, this.c = (768 & e) >> 8, this.n = (192 & e) >> 6, this.t = 63 & e
}
function c(e) {
this.s = e >> 10 & 3, this.i = 1023 & e
}
function n() {
}
function e(e) {
this.a = (3072 & e) >> 10, this.c = (768 & e) >> 8, this.n = (192 & e) >> 6, this.t = 63 & e
}
function o(e) {
this.h = (4095 & e) >> 2, this.t = 3 & e
}
function r(e) {
this.s = e >> 10 & 3, this.i = e >> 2 & 255, this.t = 3 & e
}
s.prototype.e = function (e) {
e.o = !1
}, i.prototype.e = function (e) {
switch (this.t) {
case 0:
e.r[this.s] = this.i;
break;
case 1:
e.r[this.s] = e.k[this.h]
}
}, h.prototype.e = function (e) {
e.k[this.h] = e.r[this.s]
}, a.prototype.e = function (e) {
switch (this.t) {
case 0:
e.r[this.a] = e.r[this.c] + e.r[this.n];
break;
case 1:
e.r[this.a] = e.r[this.c] - e.r[this.n];
break;
case 2:
e.r[this.a] = e.r[this.c] * e.r[this.n];
break;
case 3:
e.r[this.a] = e.r[this.c] / e.r[this.n];
break;
case 4:
e.r[this.a] = e.r[this.c] % e.r[this.n];
break;
case 5:
e.r[this.a] = e.r[this.c] == e.r[this.n];
break;
case 6:
e.r[this.a] = e.r[this.c] >= e.r[this.n];
break;
case 7:
e.r[this.a] = e.r[this.c] || e.r[this.n];
break;
case 8:
e.r[this.a] = e.r[this.c] && e.r[this.n];
break;
case 9:
e.r[this.a] = e.r[this.c] !== e.r[this.n];
break;
case 10:
e.r[this.a] = t(e.r[this.c]);
break;
case 11:
e.r[this.a] = e.r[this.c] in e.r[this.n];
break;
case 12:
e.r[this.a] = e.r[this.c] > e.r[this.n];
break;
case 13:
e.r[this.a] = -e.r[this.c];
break;
case 14:
e.r[this.a] = e.r[this.c] < e.r[this.n];
break;
case 15:
e.r[this.a] = e.r[this.c] & e.r[this.n];
break;
case 16:
e.r[this.a] = e.r[this.c] ^ e.r[this.n];
break;
case 17:
e.r[this.a] = e.r[this.c] << e.r[this.n];
break;
case 18:
e.r[this.a] = e.r[this.c] >>> e.r[this.n];
break;
case 19:
e.r[this.a] = e.r[this.c] | e.r[this.n];
break;
case 20:
e.r[this.a] = !e.r[this.c]
}
}, c.prototype.e = function (e) {
e.Q.push(e.C), e.B.push(e.k), e.C = e.r[this.s], e.k = [];
for (var t = 0; t < this.i; t++) e.k.unshift(e.f.pop());
e.g.push(e.f), e.f = []
}, n.prototype.e = function (e) {
e.C = e.Q.pop(), e.k = e.B.pop(), e.f = e.g.pop()
}, e.prototype.e = function (e) {
switch (this.t) {
case 0:
e.u = e.r[this.a] >= e.r[this.c];
break;
case 1:
e.u = e.r[this.a] <= e.r[this.c];
break;
case 2:
e.u = e.r[this.a] > e.r[this.c];
break;
case 3:
e.u = e.r[this.a] < e.r[this.c];
break;
case 4:
e.u = e.r[this.a] == e.r[this.c];
break;
case 5:
e.u = e.r[this.a] != e.r[this.c];
break;
case 6:
e.u = e.r[this.a];
break;
case 7:
e.u = !e.r[this.a]
}
}, o.prototype.e = function (e) {
switch (this.t) {
case 0:
e.C = this.h;
break;
case 1:
e.u && (e.C = this.h);
break;
case 2:
e.u || (e.C = this.h);
break;
case 3:
e.C = this.h, e.w = null
}
e.u = !1
}, r.prototype.e = function (e) {
switch (this.t) {
case 0:
for (var t = [], n = 0; n < this.i; n++) t.unshift(e.f.pop());
e.r[3] = e.r[this.s](t[0], t[1]);
break;
case 1:
for (var r = e.f.pop(), o = [], i = 0; i < this.i; i++) o.unshift(e.f.pop());
e.r[3] = e.r[this.s][r](o[0], o[1]);
break;
case 2:
for (var a = [], c = 0; c < this.i; c++) a.unshift(e.f.pop());
e.r[3] = new e.r[this.s](a[0], a[1])
}
};
var k = function (e) {
for (var t = 66, n = [], r = 0; r < e.length; r++) {
var o = 24 ^ e.charCodeAt(r) ^ t;
n.push(String.fromCharCode(o)), t = o
}
return n.join("")
};
function Q(e) {
this.t = (4095 & e) >> 10, this.s = (1023 & e) >> 8, this.i = 1023 & e, this.h = 63 & e
}
function C(e) {
this.t = (4095 & e) >> 10, this.a = (1023 & e) >> 8, this.c = (255 & e) >> 6
}
function B(e) {
this.s = (3072 & e) >> 10, this.h = 1023 & e
}
function f(e) {
this.h = 4095 & e
}
function g(e) {
this.s = (3072 & e) >> 10
}
function u(e) {
this.h = 4095 & e
}
function w(e) {
this.t = (3840 & e) >> 8, this.s = (192 & e) >> 6, this.i = 63 & e
}
function G() {
this.r = [0, 0, 0, 0], this.C = 0, this.Q = [], this.k = [], this.B = [], this.f = [], this.g = [], this.u = !1, this.G = [], this.b = [], this.o = !1, this.w = null, this.U = null, this.F = [], this.R = 0, this.J = {
0: s,
1: i,
2: h,
3: a,
4: c,
5: n,
6: e,
7: o,
8: r,
9: Q,
10: C,
11: B,
12: f,
13: g,
14: u,
15: w
}
}
Q.prototype.e = function (e) {
switch (this.t) {
case 0:
e.f.push(e.r[this.s]);
break;
case 1:
e.f.push(this.i);
break;
case 2:
e.f.push(e.k[this.h]);
break;
case 3:
e.f.push(k(e.b[this.h]))
}
}, C.prototype.e = function (A) {
switch (this.t) {
case 0:
var t = A.f.pop();
A.r[this.a] = A.r[this.c][t];
break;
case 1:
var s = A.f.pop(), i = A.f.pop();
A.r[this.c][s] = i;
break;
case 2:
var h = A.f.pop();
A.r[this.a] = eval(h)
}
}, B.prototype.e = function (e) {
e.r[this.s] = k(e.b[this.h])
}, f.prototype.e = function (e) {
e.w = this.h
}, g.prototype.e = function (e) {
throw e.r[this.s]
}, u.prototype.e = function (e) {
var t = this, n = [0];
e.k.forEach(function (e) {
n.push(e)
});
var r = function (r) {
var o = new G;
return o.k = n, o.k[0] = r, o.v(e.G, t.h, e.b, e.F), o.r[3]
};
r.toString = function () {
return "() { [native code] }"
}, e.r[3] = r
}, w.prototype.e = function (e) {
switch (this.t) {
case 0:
for (var t = {}, n = 0; n < this.i; n++) {
var r = e.f.pop();
t[e.f.pop()] = r
}
e.r[this.s] = t;
break;
case 1:
for (var o = [], i = 0; i < this.i; i++) o.unshift(e.f.pop());
e.r[this.s] = o
}
}, G.prototype.D = function (e) {
console.log(window.atob(e));
for (var t = window.atob(e), n = t.charCodeAt(0) << 8 | t.charCodeAt(1), r = [], o = 2; o < n + 2; o += 2) r.push(t.charCodeAt(o) << 8 | t.charCodeAt(o + 1));
this.G = r;
for (var i = [], a = n + 2; a < t.length;) {
var c = t.charCodeAt(a) << 8 | t.charCodeAt(a + 1), s = t.slice(a + 2, a + 2 + c);
i.push(s), a += c + 2
}
this.b = i
}, G.prototype.v = function (e, t, n) {
for (t = t || 0, n = n || [], this.C = t, "string" == typeof e ? this.D(e) : (this.G = e, this.b = n), this.o = !0, this.R = Date.now(); this.o;) {
var r = this.G[this.C++];
if ("number" != typeof r) break;
var o = Date.now();
if (500 < o - this.R) return;
this.R = o;
try {
this.e(r)
} catch (e) {
this.U = e, this.w && (this.C = this.w)
}
}
}, G.prototype.e = function (e) {
var t = (61440 & e) >> 12;
new this.J[t](e).e(this)
}, "undefined" != typeof window && (new G).v("AxjgB5MAnACoAJwBpAAAABAAIAKcAqgAMAq0AzRJZAZwUpwCqACQACACGAKcBKAAIAOcBagAIAQYAjAUGgKcBqFAuAc5hTSHZAZwqrAIGgA0QJEAJAAYAzAUGgOcCaFANRQ0R2QGcOKwChoANECRACQAsAuQABgDnAmgAJwMgAGcDYwFEAAzBmAGcSqwDhoANECRACQAGAKcD6AAGgKcEKFANEcYApwRoAAxB2AGcXKwEhoANECRACQAGAKcE6AAGgKcFKFANEdkBnGqsBUaADRAkQAkABgCnBagAGAGcdKwFxoANECRACQAGAKcGKAAYAZx+rAZGgA0QJEAJAAYA5waoABgBnIisBsaADRAkQAkABgCnBygABoCnB2hQDRHZAZyWrAeGgA0QJEAJAAYBJwfoAAwFGAGcoawIBoANECRACQAGAOQALAJkAAYBJwfgAlsBnK+sCEaADRAkQAkABgDkACwGpAAGAScH4AJbAZy9rAiGgA0QJEAJACwI5AAGAScH6AAkACcJKgAnCWgAJwmoACcJ4AFnA2MBRAAMw5gBnNasCgaADRAkQAkABgBEio0R5EAJAGwKSAFGACcKqAAEgM0RCQGGAYSATRFZAZzshgAtCs0QCQAGAYSAjRFZAZz1hgAtCw0QCQAEAAgB7AtIAgYAJwqoAASATRBJAkYCRIANEZkBnYqEAgaBxQBOYAoBxQEOYQ0giQKGAmQABgAnC6ABRgBGgo0UhD/MQ8zECALEAgaBxQBOYAoBxQEOYQ0gpEAJAoYARoKNFIQ/zEPkAAgChgLGgkUATmBkgAaAJwuhAUaCjdQFAg5kTSTJAsQCBoHFAE5gCgHFAQ5hDSCkQAkChgBGgo0UhD/MQ+QACAKGAsaCRQCOYGSABoAnC6EBRoKN1AUEDmRNJMkCxgFGgsUPzmPkgAaCJwvhAU0wCQFGAUaCxQGOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQMOZISPzZPkQAaCJwvhAU0wCQFGAUaCxQSOZISPzZPkQAaCJwvhAU0wCQFGAkSAzRBJAlz/B4FUAAAAwUYIAAIBSITFQkTERwABi0GHxITAAAJLwMSGRsXHxMZAAk0Fw8HFh4NAwUABhU1EBceDwAENBcUEAAGNBkTGRcBAAFKAAkvHg4PKz4aEwIAAUsACDIVHB0QEQ4YAAsuAzs7AAoPKToKDgAHMx8SGQUvMQABSAALORoVGCQgERcCAxoACAU3ABEXAgMaAAsFGDcAERcCAxoUCgABSQAGOA8LGBsPAAYYLwsYGw8AAU4ABD8QHAUAAU8ABSkbCQ4BAAFMAAktCh8eDgMHCw8AAU0ADT4TGjQsGQMaFA0FHhkAFz4TGjQsGQMaFA0FHhk1NBkCHgUbGBEPAAFCABg9GgkjIAEmOgUHDQ8eFSU5DggJAwEcAwUAAUMAAUAAAUEADQEtFw0FBwtdWxQTGSAACBwrAxUPBR4ZAAkqGgUDAwMVEQ0ACC4DJD8eAx8RAAQ5GhUYAAFGAAAABjYRExELBAACWhgAAVoAQAg/PTw0NxcQPCQ5C3JZEBs9fkcnDRcUAXZia0Q4EhQgXHojMBY3MWVCNT0uDhMXcGQ7AUFPHigkQUwQFkhaAkEACjkTEQspNBMZPC0ABjkTEQsrLQ==");
var abc = function (e) {
console.log(__g._encrypt(encodeURIComponent(e)))
// return __g._encrypt(encodeURIComponent(e))
};
// function run(e) {
// return __g._encrypt(encodeURIComponent(e))
// }
// module.exports = {run}
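If a local Node.js runtime is available, the same script can in principle be driven from Python with PyExecJS instead; a rough sketch, assuming the script above is saved as zhihu_encrypt.js with the jsdom preamble and the run/module.exports lines uncommented (all names here are placeholders):
import execjs  # pip install PyExecJS; needs a local Node.js runtime

with open('zhihu_encrypt.js', encoding='utf8') as f:
    ctx = execjs.compile(f.read())

fmd5 = 'put the md5 hex digest from get_fmd5() here'  # hypothetical value
encrypted = ctx.call('run', fmd5)  # should correspond to x-zse-96 without the leading "2.0_"
print(encrypted)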
Once you have the id, you can happily fetch the information you need.
# Example with 计算机科学与技术: concatenate the required pieces, md5 them, and call the JS to get the id; below is a test
test = 'a0F8o7L8r8Yf6RY88RY0eAHqFBFYeLtyKXO02Ae0F9SX'
use_url = 'https://www.zhihu.com/api/v4/search_v3?t=general&q=计算机科学与技术&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
'cookie': 'd_c0=%s;' % (d_c),
# 'referer': referer,
'x-zse-93': '101_3_2.0',
'x-zse-96': '2.0_%s' % test, # js加密而来
}
res = requests.get(use_url, headers=headers)
That said, Zhihu no longer carries the original data on universities, the schools offering each major, employment prospects and so on...
4. Wutongguo (梧桐果) data scraping
This uses the scrapy pattern of chaining requests across several URL levels; see section 7 below.
5. Scraping Google Play app reviews
Target app: 西柚 (seeyou)
Main module: requests under Python 3.6
After fetching the page source with requests, the hard part is extracting the information.
There are two ways to extract it:
- Convert the text to HTML and extract with XPath (this does not work when the source shown in the rendered page differs from the source you actually receive, which is the case here).
- Extract with regular expressions (a bit harder than XPath).
In Google Play app reviews, the user ratings and review text sit in the AF_initDataCallback block whose key is 'ds:25' (the ds number differs from app to app, so look for the pattern). Locate the fields you need in that data and pull them out with a regex.
import re,demjson
import pandas as pd
test = open('../google_facebook.txt', encoding='utf8').read()  # the raw requests response for Facebook
p = re.compile(r"AF_initDataCallback\({key: \'ds:25.+?, data:.+?, sideChannel: {}}\)",re.DOTALL)
tt = p.findall(test)
tt_dict = demjson.decode(tt[0].replace('AF_initDataCallback(', '', 1)[:-1])  # strip only the wrapper, not every ')' inside the data
test_res=[]
for i, v in enumerate(tt_dict["data"][0]):
d = dict()
d['name'] = v[1][0]
d['star'] = v[2]
d['contents'] = v[4]
test_res.append(d)
pd.DataFrame(test_res)
6. Getting coordinates from the Baidu Maps API with Python
- Get the API key (ak):
  - Register as a developer.
  - Create an application (browser-side type; set the allowed access IP to *).
  - The detailed steps are shown in the screenshots (omitted here).
- Call the API with the requests module to get the coordinates of an input address.
import requests
import json
def getGeo(p):
url = 'https://api.map.baidu.com/geocoding/v3/'
params = {'address': p,
              'ak': '2gDvpo219bUG91R8VXGK5rhXYg7GKg8o',  # the Baidu AK of the application you created
              'output': 'json'}  # return the result in JSON format
res = requests.get(url, params)
jd = json.loads(res.text)
return jd
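A quick usage sketch, assuming the standard Geocoding v3 response shape (status 0 means success; the address is just an example):
jd = getGeo('北京市海淀区上地十街10号')
if jd.get('status') == 0:
    loc = jd['result']['location']
    print(loc['lng'], loc['lat'])  # longitude, latitude
else:
    print('geocoding failed:', jd)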
7. Using scrapy to fetch the five-level administrative divisions from the National Bureau of Statistics
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
2020 data: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
7.1 Create a project
scrapy startproject getGeoInfoFromStatsGov
cd getGeoInfoFromStatsGov
scrapy genspider geoInfo stats.gov.cn
- -t: specifies which template to use; the default is "basic"
7.2 Adjust the configuration
7.2.1 pipelines
When first generated, the pipeline only contains the process_item function, which receives the items (the data returned or yielded by the spider).
To write to a file while avoiding opening and closing it repeatedly, we add a file-opening function above process_item and a file-closing function below it.
class GetgeoinfofromstatsgovPipeline:
def open_spider(self, spider):
        # open the output file
self.fp = open('../geoInfo.txt', 'a', encoding='utf8')
def process_item(self, item, spider):
        self.fp.write(str(item))  # write the item
return item
def close_spider(self, spider):
        # close the file
self.fp.close()
7.2.2 items
Define a Field instance for each of your variables.
Every variable you yield or return must be defined here; a sketch follows.
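A sketch of items.py covering exactly the fields that the spider in 7.3 yields:
import scrapy

class GetgeoinfofromstatsgovItem(scrapy.Item):
    # one Field per value yielded by the spider
    province = scrapy.Field()
    cityCode = scrapy.Field()
    cityName = scrapy.Field()
    countyCode = scrapy.Field()
    countyName = scrapy.Field()
    townCode = scrapy.Field()
    townName = scrapy.Field()
    vCode = scrapy.Field()
    vName = scrapy.Field()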
7.2.3 settings
This is where the routine settings live:
- whether to obey the robots.txt protocol (if we obeyed it there would be nothing left to crawl);
- uncommenting the pipelines entry (you can also write several pipelines, as long as each one is registered in settings with a weight; the default is 300, and the actual value only matters relative to the others);
- the log level printed to the console (optional; without it the console prints a lot, including debug- and warning-level messages);
- middleware registration;
- database-related settings;
- distributed-crawling settings.
Here I only changed the log level and its output, and uncommented the pipelines entry.
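A sketch of the corresponding settings.py lines, assuming the default project layout (writing the log to a file is optional and only an assumption here):
# settings.py (excerpt)
ROBOTSTXT_OBEY = False    # do not obey robots.txt

LOG_LEVEL = 'WARNING'     # keep the console output manageable
LOG_FILE = 'geoInfo.log'  # optional: also write the log to a file

ITEM_PIPELINES = {
    # the number is the pipeline's weight; 300 is the conventional default
    'getGeoInfoFromStatsGov.pipelines.GetgeoinfofromstatsgovPipeline': 300,
}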
7.2.4 middlewares
Writing your own middleware means first understanding the internals of the request/response cycle. That is fairly advanced, and I normally do not write any.
To write one:
- First uncomment the middleware section in settings and register your own middleware there (following the existing format).
- A middleware that adds a random User-Agent is a typical example; a sketch is given below.
Registration happens in settings.
If I remember correctly, the smaller the number, the earlier it runs.
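A minimal sketch of such a random-UA downloader middleware; the UA pool, class name and the 543 priority are illustrative choices, not taken from the original project:
# middlewares.py
import random

class RandomUserAgentMiddleware:
    # a small illustrative pool of user agents; extend as needed
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # set a random UA on every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

# settings.py: register it (smaller number = called earlier for process_request)
# DOWNLOADER_MIDDLEWARES = {
#     'getGeoInfoFromStatsGov.middlewares.RandomUserAgentMiddleware': 543,
# }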
7.2.5 Storing the crawled data in a database
I have not done this myself; the write-ups online make it look simple enough, so search for it if you need it.
7.3 Writing the spider itself
geoInfo.py is the main spider; it is the file generated by scrapy genspider geoInfo. Note that this name must not be the same as your crawler project's name.
7.3.1 Crawling the first-level URLs
The freshly generated geoInfo.py contains only a single parse function.
It extracts each province name from the NBS index page and passes it on to the pipelines.
Obviously each province links to its next-level page with the city-level information, each city page in turn links to the county-level information, and so on, which is where multi-level URL crawling comes in.
7.3.2 Crawling multi-level URLs
Each yield sets callback to the function that parses the next level, and passes the information collected at the current level along with it.
This is done with scrapy.Request().
Parameters:
- url: the URL the next-level function should request
- callback: the name of the next-level parse function to invoke
- meta: a dict used to pass information down to that parse function
The full code is below.
import scrapy
from ..items import GetgeoinfofromstatsgovItem
class GeoinfoSpider(scrapy.Spider):
name = 'geoInfo'
allowed_domains = ['stats.gov.cn']
start_urls = [
'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'] # http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
def parse(self, response):
firstLevels = response.xpath('//tr[@class="provincetr"]//td//a')
for i in firstLevels:
province = i.xpath('./text()').extract_first()
p_url = i.xpath('./@href').extract_first()
p_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + p_url
# if province in ['新疆维吾尔自治区', '云南省', '安徽省', '黑龙江省', '山西省', '河北省', '宁夏回族自治区', '四川省', '广东省', '内蒙古自治区', '上海市',
# '湖北省', '青海省', '广西壮族自治区', '辽宁省', '江苏省', '天津市', '北京市','甘肃省', '江西省', '福建省', '陕西省', '湖南省', '吉林省', '山东省','西藏自治区', '贵州省', '重庆市']:
# continue
yield scrapy.Request(url= p_url, callback=self.secondParse,
meta={'info': (province, p_url)})
def secondParse(self, response):
province, p_url = response.meta.get('info')
print(2, province, p_url)
cityLevels = response.xpath('//table[@class="citytable"]//tr[@class="citytr"]')
for c in cityLevels:
cityCode = c.xpath('./td[1]//text()').extract_first()
cityName = c.xpath('./td[2]//text()').extract_first()
cityUrl = c.xpath('./td[2]//@href').extract_first()
c_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl
yield scrapy.Request(url=c_url, callback=self.thirdParse,
meta={'info': (province, cityCode, cityName, cityUrl)})
def thirdParse(self, response):
province, cityCode, cityName, cityUrl = response.meta.get('info')
cityUrl = cityUrl.strip().split('/')[0]
countyLevels = response.xpath('//table[@class="countytable"]//tr[@class="countytr"]')
for co in countyLevels:
countyCode = co.xpath('./td[1]//text()').extract_first()
countyName = co.xpath('./td[2]//text()').extract_first()
countyUrl = co.xpath('./td[2]//@href').extract_first()
if countyUrl:
co_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl + '/' + countyUrl
yield scrapy.Request(url=co_url, callback=self.fourParse,
meta={'info': (
province, cityCode, cityName, countyCode, countyName, cityUrl, countyUrl)})
else:
geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
countyCode=countyCode, countyName=countyName)
yield geo
def fourParse(self, response):
province, cityCode, cityName, countyCode, countyName, cityUrl, countyUrl = response.meta.get('info')
countyUrl = countyUrl.strip().split('/')[0]
townLevels = response.xpath('//table[@class="towntable"]//tr[@class="towntr"]')
for t in townLevels:
townCode = t.xpath('./td[1]//text()').extract_first()
townName = t.xpath('./td[2]//text()').extract_first()
townUrl = t.xpath('./td[2]//@href').extract_first()
if townUrl:
t_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + cityUrl + '/' + countyUrl + '/' + townUrl
yield scrapy.Request(url=t_url, callback=self.fiveParse,
meta={'info': (
province, cityCode, cityName, countyCode, countyName, townCode, townName)})
else:
geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
countyCode=countyCode, countyName=countyName, townCode=townCode,
townName=townName)
yield geo
def fiveParse(self, response):
province, cityCode, cityName, countyCode, countyName, townCode, townName = response.meta.get('info')
villageLevels = response.xpath('//table[@class="villagetable"]//tr[@class="villagetr"]')
for v in villageLevels:
vCode = v.xpath('./td[1]/text()').extract_first()
vName = v.xpath('./td[3]/text()').extract_first()
geo = GetgeoinfofromstatsgovItem(province=province, cityCode=cityCode, cityName=cityName,
countyCode=countyCode, countyName=countyName, townCode=townCode,
townName=townName, vCode=vCode, vName=vName)
yield geo
7.4 Data cleaning
import demjson
import pandas as pd
with open('E:/programDll/geoInfoNew.txt', 'a', encoding='utf8') as fp:
with open('E:/programDll/geoInfo.txt', encoding='utf8') as f:
for l in f.readlines():
l = l.strip()
if len(l.split('}')) == 2:
l1 = l.split('}')[0] + "}" + "\n"
fp.write(l1)
if l.split('}')[1]:
l2 = l.split('}')[1]
fp.write(l2)
else:
pass
elif l.split('{')[0] == "":
fp.write(l)
else:
fp.write(l)
geoInfo = pd.read_csv('E:/programDll/geoInfoNew.txt', header=None, sep='\n')
geoInfo.columns = ['info']
# geoInfo['info'] = geoInfo['info'].apply(lambda x: x.replace(',{', '{').replace(",,", ","))
geoInfo['info'] = geoInfo['info'].apply(lambda x: demjson.decode(x))
geoInfo['vCode'] = geoInfo['info'].apply(lambda x: x.get('vCode'))
geoInfo['vName'] = geoInfo['info'].apply(lambda x: x.get('vName'))
geoInfo['townCode'] = geoInfo['info'].apply(lambda x: x.get('townCode'))
geoInfo['townName'] = geoInfo['info'].apply(lambda x: x.get('townName'))
geoInfo['countyCode'] = geoInfo['info'].apply(lambda x: x.get('countyCode'))
geoInfo['countyName'] = geoInfo['info'].apply(lambda x: x.get('countyName'))
geoInfo['cityCode'] = geoInfo['info'].apply(lambda x: x.get('cityCode'))
geoInfo['cityName'] = geoInfo['info'].apply(lambda x: x.get('cityName'))
geoInfo['province'] = geoInfo['info'].apply(lambda x: x.get('province'))
geoInfo = geoInfo.iloc[:, 1:]
geoInfo.drop_duplicates(inplace=True)
geoInfo.to_csv('E:/programDll/getGeoInfoFromStatsGov/2020年五级地理位置划分.txt', index=None)
Sample data