项目描述:流量规则变动,需要启用百度SEM影像中的行业定投模块。由于人工收集 url 速度过慢、不便统计数据,所以写了个爬取 url 的小脚本,之后会贴上文本分析的内容,待更新~
使用语言:python3.7
# -*- coding:utf-8 -*-
'''
从百度把前n页的搜索到的url(自然结果)爬取保存
'''
from bs4 import BeautifulSoup # 处理抓到的页面
import sys
import requests
import importlib
importlib.reload(sys) # 编码转换,python3默认utf-8,一般不用加
from urllib import request
import urllib
import time
# Output file for the collected URLs.
# NOTE(review): the name `all` shadows the builtin all(); kept unchanged so the
# rest of the script keeps working, but it should eventually be renamed.
# Opening in 'w' mode truncates on open, replacing the original
# 'a' + seek(0) + truncate() dance; encoding pinned so URLs write portably.
all = open('test.txt', 'w', encoding='utf-8')
# Request headers that make the script look like a real Chrome session.
# NOTE(review): the Cookie is a captured personal session and will expire;
# refresh it when Baidu starts returning anti-bot pages.
# The original dict listed 'Accept-Encoding' twice ('gzip, deflate, br' and
# 'identity'); in a dict literal the last duplicate silently wins, so only the
# effective value is kept: 'identity' avoids receiving gzip bodies that
# urlopen().read().decode('utf-8') cannot decompress.
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding':'identity',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie': 'BIDUPSID=26F3A8FFEBB29CC21E3F45461769D121; PSTM=1563005786; BD_UPN=12314753; __cfduid=d3c8df9329cfd92708783ef57382736f91571370130; BAIDUID=7C23A308E60857D748A6570C7D29ACC4:FG=1; sugstore=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; yjs_js_security_passport=6a464703ed15f8ec7d2fa494139f854d62e2ab60_1574238636_js; H_WISE_SIDS=135670_137151_137734_127760_128698_134727_137413_135846_120170_138444_138002_137978_132911_138144_137692_131246_132551_137745_136681_118884_118871_118846_118823_118791_136688_107318_136431_136094_138478_133352_137900_136863_138148_138325_129649_136196_124636_133847_138343_137468_134047_129643_131423_138511_137970_137465_137703_110085_127969_137912_137829_138544_127417_136635_137208_138318_137449; lsv=globalT_androidcss_ac15436-wwwT_androidcss_8226982-searchboxcss_591d86b-globalBcss_565c244-wwwBcss_777000e; MSA_WH=400_635; shifen[149822088569_73399]=1574320676; shifen[92445354938_1520]=1574320684; shifen[136950954973_31508]=1574320688; BCLID=10572530311020761238; BDSFRCVID=A-uOJeC62rao1ZrwcG-PhF7bSfF0fE5TH6aVsRCZ9Bb6m3hmOnHdEG0PHx8g0KAb0xm0ogKK0mOTHv-F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJAe_C-KJK-3fP36qRbHMJFe5fob-C6X56Ps_JAMBhcqEIL4KxRn-TLS5UbxexJ33JvabMQ7BI38VxbSj4QoK5_Oh2cXX46DaJ4O3bb1-q5nhMJ33j7JDMP0-G5h3hoy523ion6vQpn-KqQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xb6_0j6oQjN_DJTnjJTRe0Rj-KbP_hDL9eKTjhPrMy-LOWMT-0bFHhq-yL-oxfnOjqfQsQ6-V0P7MLq4OWHn7_JjC3R3cJ4cxjtkVKR_Ee-nG5xQxtNRB2CnjtpvNKqRoM6bobUPUDMc9LUvqHmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLtDDBMIPGjjL35n-WqxT-Kbjyb6R-0PK8a-oJbpDz5fnkbfJBDl5K25oz-jv2K56x2pLbHxOIQ63tQjL7yajKLJJN0aL85J_KaCOsDq7ky-OpQT8ryhAOK5Oi0CujXnCMab3vOIJzXpO15CuzBN5thURB2DkO-4bCWJ5TMl5jDh3Mb6ksD-FtqtJHKbDq_KIX3f; delPer=0; BD_HOME=0; H_PS_PSSID=1463_21094_29568_29699_29220_26350_22158; BD_CK_SAM=1; PSINO=1; H_PS_645EC=4c16EYZPaBgfi5uofCq9q8HCvj0uqfFlZg8lNra3n4U9xjQJan9VdGp0syM; BDSVRTM=135; COOKIE_SESSION=1964_1_9_7_0_12_0_3_8_5_5_5_0_0_0_0_0_1574318712_1574320702%7C9%23207_87_1574316693%7C9',
'Host':'www.baidu.com',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}  # browser-mimicking headers
def getfromBaidu(word, pages=1):
    """Crawl the first `pages` Baidu result pages for keyword `word`.

    word  -- search keyword; URL-quoted before being placed in the query.
    pages -- how many result pages to fetch (default 1, matching the
             original hard-coded range(1, 2) behavior).
    """
    # Baidu paginates with the `pn` query parameter: page k starts at (k-1)*10,
    # which geturl() appends for each page.
    base_url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(word) + '&pn='
    for k in range(1, pages + 1):
        geturl(base_url, k)
def geturl(url, k):
    """Fetch result page k of the Baidu SERP at `url` and record landing URLs.

    For every <h3> result title: prefer the `data-landurl` attribute when
    Baidu embeds the real landing page there; otherwise request the wrapped
    redirect link once with allow_redirects=False and read the true target
    from the Location response header.  Each resolved URL is printed AND
    appended to the module-level output file `all` (the original only
    printed, so the prepared output file stayed empty).
    """
    path = url + str((k - 1) * 10)  # pn=(k-1)*10 selects page k
    req = urllib.request.Request(url=path, headers=headers)
    page = urllib.request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(page, 'lxml')
    # find_all() never yields None, so the original `if h3 is None` guard was
    # dead code; what CAN be missing is the <a> inside an <h3>.
    for h3 in soup.find_all('h3'):
        anchor = h3.find('a')
        if anchor is None:
            continue
        landurl = anchor.get('data-landurl')
        if landurl is not None:
            print("data-landurl:", landurl)
            all.write(landurl + '\n')
            continue
        href = anchor.get('href')
        if not href:
            continue
        try:
            # Don't follow the redirect; the real destination is in the
            # Location header of the first response.  .get() avoids the
            # KeyError the original bare except was silently hiding.
            resp = requests.get(url=href, headers=headers, allow_redirects=False)
            real_url = resp.headers.get('Location', '')
        except requests.RequestException:
            print("未能找到网址")
            continue
        if real_url.startswith('http'):
            print("real_url:", real_url)
            all.write(real_url + '\n')
if __name__ == '__main__':
    # Read one search keyword per line from the local keyword list.
    # `with` guarantees the file is closed even if a later crawl raises
    # (the original closed it only after the whole loop succeeded).
    with open(r"C:\Users\果果\Desktop\园长定投搜索词库.txt", "r", encoding='utf-8') as f:
        words = f.readlines()
    for j, line in enumerate(words, start=1):
        print(j)  # 1-based progress counter, same output as before
        word = line.rstrip("\n")
        print(word)
        getfromBaidu(word)
notebook结果展示:
关于手机端的爬取教程在另一篇博客,之后放到CSDN上