# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import requests
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
# Sogou WeChat article search URL. Format slots: query keyword, result page.
# type=2 selects article search (vs. official-account search); ie=utf8 fixes encoding.
base_url = 'https://weixin.sogou.com/weixin?query={}&s_from=input&type=2&page={}&ie=utf8'
# Request headers: a session Cookie plus a desktop User-Agent help avoid
# Sogou's anti-bot 302 redirect to a captcha page.
# NOTE(review): the hard-coded Cookie is likely stale -- refresh before relying on it.
headers = {
'Cookie':'SMYUV=156867c543fede325; sct=2',
'Host':'weixin.sogou.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}
#def get_proxy()
def get_html(url, retries=3):
    """Fetch *url* and return the response body, or None on failure.

    Redirects are deliberately NOT followed (``allow_redirects=False``) so
    that Sogou's anti-bot 302 redirect is detected rather than silently
    followed to a captcha page.

    Args:
        url: Absolute URL to request.
        retries: Remaining attempts after a ``ConnectionError``. The
            original recursed unboundedly on connection failures, which
            could exhaust the stack during a persistent outage.

    Returns:
        The response text when the status is 200, otherwise ``None``.
    """
    try:
        response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            # Log before returning -- the original printed *after* the
            # return statement, making the print unreachable.
            print('200')
            return response.text
        if response.status_code == 302:
            # Sogou flagged this client; a captcha page is being served.
            print('302')
        if response.status_code == 404:
            print('404')
        return None
    except ConnectionError:
        if retries > 0:
            return get_html(url, retries - 1)
        return None
def get_index(keyword, page):
    """Search Sogou WeChat for *keyword* and parse result page *page*.

    Prints the ``data-share`` URL of every article link found.

    Args:
        keyword: Search query string (URL-encoded by requests/Sogou).
        page: 1-based result page number.

    Returns:
        List of the ``data-share`` attribute values (may contain ``None``
        for anchors lacking the attribute). Empty list when the page
        could not be fetched.
    """
    url = base_url.format(keyword, page)
    html = get_html(url)
    if html is None:
        # Fetch failed (non-200 or connection error); pq(None) would raise.
        return []
    doc = pq(html)
    share_urls = []
    for anchor in doc('.news-list li .txt-box h3 a').items():
        share_url = anchor.attr('data-share')
        print(share_url)
        share_urls.append(share_url)
    # The original then did `print(items)`, which only printed the repr of
    # the already-exhausted generator -- dropped as useless output.
    return share_urls
#def parse_index(html):
# doc = pq(html)
# items = doc('.news-box .news-list li .txt-box h3 a').text()
## for item in items:
## yield item.attr('href')
# return{
# 'item':items}
if __name__ == '__main__':
    # Demo run: search for '风景' ("scenery"), fetch result page 10.
    get_index('风景',10)
# requests crawler for Sogou WeChat official-account articles (搜狗微信公众号)
# Trailing blog-platform residue, not code: "最新推荐文章于 2024-05-13 15:37:43 发布"