一、分析百度统计网站内容
1.打开网站,找到趋势分析,按F12打开开发者工具,在“元素”面板中找到数据
2.其次,打开网站,找到趋势分析,按F12打开开发者工具,在“网络”面板中找到数据
其中,里面的label标签被编码处理过,需要进行解码后才为中文;
3.最后需要找到数据的请求地址、方式和请求头,才可以将数据爬取下来
二、编写爬虫代码,包括获取文本、解析文本、清洗数据与保存数据
1.构建请求头,获取文本内容
def __init__(self):
self.url = 'https://tongji.baidu.com/web5/10000569924/trend/time?siteId=19477719'
self.headers = {
'Accept':'*/*',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection':'keep-alive',
'Host':'tongji.baidu.com',
'Referer':'https://tongji.baidu.com/main/overview/10000569924/trend/time?siteId=19477719',
'Sec-Ch-Ua':'"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'Cookie':'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
'User-Agent':'XXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
}
def get_html(self):
html = requests.get(url=self.url,headers=self.headers).text
print(html)
爬取结果如下图:发现需要获取的数据在javascript文件里面,因此需要利用正则re根据其特点去提取所需要的数据
2.利用正则re、列表list以及字典dict提取整理数据并保存至sql server数据库中
def parse1_html(self,html):
L1,L2,D1 = [],[],{}
p = re.compile(r'"sourceList":(.*),"indexInfo"', re.MULTILINE)
p1 = p.findall(html)
s1 = p1[1].replace('{','').replace('}','').replace('[','').replace(']','').split(',')
for i in range(len(s1)):
if(s1[i].split(':')[0]=='"id"'):
L1.append(s1[i].split(':')[1])
elif (s1[i].split(':')[0]=='"label"'):
L2.append((s1[i].split(':')[1].replace('"','').replace('"','')).encode('utf-8').decode('unicode_escape'))
else:
continue
L3 = L1[3:len(L1)-1]
L3.insert(0,'0')
L4 = L2[3:len(L2)-1]
for j in range(len(L4)):
D1[L3[j]] = L4[j]
self.parse3_sql(D1,"searchsource")
def parse2_html(self,html):
L1,L2,D1 = [],[],{}
p = re.compile(r'"areaList":(.*),"visitorList"', re.MULTILINE)
p1 = p.findall(html)
s1 = p1[1].replace('{','').replace('}','').replace('[','').replace(']','').split(',')
for i in range(len(s1)):
if(s1[i].split(':')[0]=='"id"'):
L1.append(s1[i].split(':')[1])
elif (s1[i].split(':')[0]=='"label"'):
L2.append((s1[i].split(':')[1].replace('"','').replace('"','')).encode('utf-8').decode('unicode_escape'))
else:
continue
L3 = L1[3:len(L1)-1]
L3.insert(0,'1')
L4 = L2[3:len(L2)-1]
for j in range(len(L4)):
D1[L3[j]] = L4[j]
self.parse3_sql(D1,"areaList")
def parse3_sql(self,dic,types):
connect = pymssql.connect(server='XXXX', user='XXXX', password='XXXX', database='XXXX')
cursor = connect.cursor()
for key,value in dic.items():#datetime.datetime.now()
sql_select = " insert into Test1 (BiHao,Name,Type,InsertTime) values"+"('{}','{}','{}','{}')".format(key,value,types,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
try:
cursor.execute(sql_select)
except Exception as e:
print(e)
connect.rollback()
connect.commit()
cursor.close()
connect.close()
三、完整代码
#coding:utf-8
import requests
import re
import datetime
import pymssql
class RenrenLogin(object):
def __init__(self):
self.url = 'https://tongji.baidu.com/web5/10000569924/trend/time?siteId=19477719'
self.headers = {
'Accept':'*/*',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection':'keep-alive',
'Host':'tongji.baidu.com',
'Referer':'https://tongji.baidu.com/main/overview/10000569924/trend/time?siteId=19477719',
'Sec-Ch-Ua':'"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'Cookie':'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
'User-Agent':'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
}
def get_html(self):
html = requests.get(url=self.url,headers=self.headers).text
self.parse1_html(html)
self.parse2_html(html)
def parse1_html(self,html):
L1,L2,D1 = [],[],{}
p = re.compile(r'"sourceList":(.*),"indexInfo"', re.MULTILINE)
p1 = p.findall(html)
s1 = p1[1].replace('{','').replace('}','').replace('[','').replace(']','').split(',')
for i in range(len(s1)):
if(s1[i].split(':')[0]=='"id"'):
L1.append(s1[i].split(':')[1])
elif (s1[i].split(':')[0]=='"label"'):
L2.append((s1[i].split(':')[1].replace('"','').replace('"','')).encode('utf-8').decode('unicode_escape'))
else:
continue
L3 = L1[3:len(L1)-1]
L3.insert(0,'0')
L4 = L2[3:len(L2)-1]
for j in range(len(L4)):
D1[L3[j]] = L4[j]
self.parse3_sql(D1,"searchsource")
def parse2_html(self,html):
L1,L2,D1 = [],[],{}
p = re.compile(r'"areaList":(.*),"visitorList"', re.MULTILINE)
p1 = p.findall(html)
s1 = p1[1].replace('{','').replace('}','').replace('[','').replace(']','').split(',')
for i in range(len(s1)):
if(s1[i].split(':')[0]=='"id"'):
L1.append(s1[i].split(':')[1])
elif (s1[i].split(':')[0]=='"label"'):
L2.append((s1[i].split(':')[1].replace('"','').replace('"','')).encode('utf-8').decode('unicode_escape'))
else:
continue
L3 = L1[3:len(L1)-1]
L3.insert(0,'1')
L4 = L2[3:len(L2)-1]
for j in range(len(L4)):
D1[L3[j]] = L4[j]
self.parse3_sql(D1,"areaList")
def parse3_sql(self,dic,types):
connect = pymssql.connect(server='XXXX', user='XXXX', password='XXXX', database='XXXX')
cursor = connect.cursor()
for key,value in dic.items():#datetime.datetime.now()
sql_select = " insert into Test1 (BiHao,Name,Type,InsertTime) values"+"('{}','{}','{}','{}')".format(key,value,types,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
try:
cursor.execute(sql_select)
except Exception as e:
print(e)
connect.rollback()
connect.commit()
cursor.close()
connect.close()
if __name__ == '__main__':
spider = RenrenLogin()
spider.get_html()
总结:数据提取需要根据数据特定结构进行处理