BeautifulSoup网页解析
#-*- coding:utf-8 -*-
#Filename:360搜索引擎
#Author:Guan
#Datetime:2018/11/30
import requests
from bs4 import BeautifulSoup
import json
import time
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Full URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded as UTF-8 (the default of ``bytes.decode``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # NOTE(review): the Cookie value looks like a captured browser session;
    # it is kept verbatim here, but consider loading it from configuration
    # instead of committing it to source.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie":"Q=u%3D360H3096670450%26n%3D%26le%3D%26m%3DZGZmWGWOWGWOWGWOWGWOWGWOZwHl%26qid%3D3096670450%26im%3D1_t01c37c6928fc149034%26src%3D360index%26t%3D1; T=s%3Dccd759892e1b135a3419e01e23177efa%26t%3D1542679014%26lm%3D%26lf%3D%26sk%3D34a25b6ef06eb6640f916a7ac7dd02ba%26mt%3D1542679014%26rc%3D%26v%3D2.0%26a%3D1; QiHooGUID=45A4F6333A9E13202FF582D464E8CB16.1543578963037; __guid=15484592.4007070523124616700.1543578964544.7502; webp=1; stc_ls_sohome=RGzW2OYRKV!3TRXVhIMSWA; __huid=11ZSgWXOw0Wun4Is5XEqKzQ7U4mjrXUxDivKDEINKN3pU%3D; gtHuid=1; dpr=1.25; count=3; _pp_wd=1; erules=p1-14%7Cecr-3%7Cp4-14%7Cp2-5%7Cp3-6"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode()
def get_cont(html, output_path=None):
    """Extract search results from *html*, print them and append them to a file.

    Every element matching the CSS selector ``.res-list`` is treated as one
    result. For each result a dict is built with:

    * ``title``    - stripped text of the first ``<h3>``;
    * ``cont_two`` - stripped text of the last ``<div>`` whose class attribute
      is exactly ``res-rich so-rich-news clearfix`` (only when one exists);
    * ``cont_one`` - stripped text of the first ``<p>`` (only when one exists).

    Each dict is serialised as one JSON line, printed, and appended to
    *output_path*.

    Args:
        html: HTML text of a search-results page.
        output_path: File to append the JSON lines to. Defaults to the
            original hard-coded data file.

    Returns:
        None.
    """
    if output_path is None:
        output_path = 'D:\\公司文件\\.PyCharmCE2018.2\\config\\scratches\\拓展\\360引擎\\360搜索引擎数据'

    soup = BeautifulSoup(html, 'lxml')
    records = []
    # Scraping rules.
    for item in soup.select('.res-list'):
        heading = item.find('h3')
        if heading is None:
            # Indexing find_all('h3')[0] raised IndexError here and aborted
            # the whole crawl; an entry without a title is skipped instead.
            continue
        record = {'title': heading.text.strip()}
        # When several rich blocks match, the last one wins.
        for rich in item.select('div[class="res-rich so-rich-news clearfix"]'):
            record['cont_two'] = rich.text.strip()
        paragraph = item.find('p')
        if paragraph is not None:
            # Results without a <p> are kept; they just carry no 'cont_one'.
            record['cont_one'] = paragraph.text.strip()
        records.append(record)

    if not records:
        # Nothing parsed: do not touch (or create) the output file.
        return

    # Open the output file once per page rather than once per record.
    with open(output_path, 'a', encoding='utf-8') as f:
        for record in records:
            line = json.dumps(record, ensure_ascii=False)
            print(line)
            f.write(line + '\n')
def parse_car_models(lines):
    """Return the car-model keywords found in comma-separated *lines*.

    Surrounding whitespace (including the trailing newline that
    ``readlines()`` keeps on every line) is stripped from each field, and
    empty fields are dropped, so no keyword ends up in a URL with a stray
    newline and no empty query is issued.

    Args:
        lines: Iterable of text lines, each holding comma-separated keywords.

    Returns:
        A list of non-empty, stripped keywords in their original order.
    """
    models = []
    for line in lines:
        for field in line.split(','):
            keyword = field.strip()
            if keyword:
                models.append(keyword)
    return models


def main():
    """Crawl result pages 1-89 of 360 Search for every configured car model."""
    # Load the car models. The handle is closed by the context manager.
    # NOTE(review): the file is read with the platform default encoding, as
    # before - confirm the config file's encoding and pass it explicitly.
    with open('D:\\公司文件\\.PyCharmCE2018.2\\config\\scratches\\拓展\\360引擎\\360车型配置文件') as file:
        chexing = parse_car_models(file.readlines())

    # Page through the results: pages are the outer loop, models the inner.
    for page in range(1, 90):
        for model in chexing:
            url = 'https://www.so.com/s?q=%s' % model + '&pn=%d' % page
            print('正在获取%s车型的%d页的数据' % (model, page))
            html = get_html(url)
            # Pause between requests to avoid hammering the server.
            time.sleep(2)
            get_cont(html)
    print('代码执行完毕')


if __name__ == '__main__':
    main()