"""
author: Voccoo
time: 2019-4-1

1. This demo only crawls the articles published within a given time window by a
   specified official account, or by the accounts matching a keyword.
   To fetch and store the account info as well, adapt the first request built
   from gzh_url, or extract it directly from the article pages.
2. The demo was put together in a hurry and many details are not polished,
   e.g. there is little checking for empty responses.
   Adapt it to your own needs before using it.
3. Proxies are stored in Redis, which is simply the author's habitual store.
   To use a different one, rewrite redis_proxy() accordingly.
4. 'For proxies, use Zhima IP!'
5. The demo only extracts the article title; extend it yourself for more fields.
"""
from fake_useragent import UserAgent
import requests, time
from scrapy import Selector
import random
import redis, json
from urllib.parse import quote
# Redis holds the proxy-IP pool.
# Pop one proxy from Redis and wrap it in a requests-style proxies dict.
def redis_proxy():
redis_conn = redis.StrictRedis(host='localhost',
password='Cs123456.',
port=6379,
db=1
)
redis_ip = redis_conn.blpop('ips')
ip = json.loads(redis_ip[1].decode('UTF-8'))
proxy = {
'https': 'https://{}'.format(ip['ip'])
}
return proxy
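

# redis_proxy() above expects every element of the 'ips' list to be a JSON
# string carrying an 'ip' field of the form 'host:port'. Below is a minimal
# sketch of seeding that pool; seed_proxy_pool is a hypothetical helper, not
# part of the original demo:
def seed_proxy_pool(proxy_addrs):
    redis_conn = redis.StrictRedis(host='localhost',
                                   password='Cs123456.',
                                   port=6379,
                                   db=1
                                   )
    for addr in proxy_addrs:
        # e.g. addr = '123.45.67.89:8888' (hypothetical address);
        # rpush appends on the right, blpop in redis_proxy() pops on the left (FIFO)
        redis_conn.rpush('ips', json.dumps({'ip': addr}))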
# Fetch an article page on mp.weixin.qq.com and return it as a Selector.
def get_html_act(url, referer):
ua = UserAgent()
while True:
proxies = redis_proxy()
try:
headers = {
'User-Agent': ua.random,
'Upgrade-Insecure-Requests': '1',
}
            session = requests.Session()
            # Warm up the session against the article host before fetching the article.
            session.get('https://mp.weixin.qq.com/',
                        headers=headers,
                        proxies=proxies,
                        timeout=3
                        )
            # Reuse the warmed-up session (and its cookies) for the article request.
            html = session.get(url,
                               headers=headers,
                               proxies=proxies,
                               # allow_redirects=False,
                               timeout=3)
if html.status_code == 200:
# print(html.text)
return Selector(text=html.text)
else:
                print('---status code {}: proxy {} got blocked---'.format(html.status_code, proxies['https']))
        except Exception as e:
            print('-----request failed or timed out: {}-----'.format(e))
# Fetch a Sogou Weixin search page and return it as a Selector.
def get_html(url, referer):
# print(url)
ua = UserAgent()
while True:
proxies = redis_proxy()
try:
headers = {
"Host": "weixin.sogou.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
'User-Agent': ua.random,
"ContentType": "text/xml;charset=utf-8",
'Referer': referer,
'Upgrade-Insecure-Requests': '1',
}
            session = requests.Session()
html = session.get(url,
headers=headers,
proxies=proxies,
allow_redirects=False,
timeout=3)
if html.status_code == 200:
return Selector(text=html.text)
else:
                print('---status code {}: proxy {} got blocked---'.format(html.status_code, proxies['https']))
        except Exception as e:
            print('-----request failed or timed out: {}-----'.format(e))
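

# Both helpers above retry on a fresh proxy forever. A minimal sketch of a
# bounded variant, assuming a hypothetical max_retries cap that is not in the
# original demo; it returns None once every attempt has failed instead of
# looping endlessly:
def get_html_capped(url, referer, max_retries=5):
    ua = UserAgent()
    for _ in range(max_retries):
        proxies = redis_proxy()
        try:
            headers = {
                'User-Agent': ua.random,
                'Referer': referer,
                'Upgrade-Insecure-Requests': '1',
            }
            html = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=3)
            if html.status_code == 200:
                return Selector(text=html.text)
        except Exception:
            # Bad proxy or timeout: fall through and try the next one.
            continue
    return None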
def run(gzh, start_time, endtime):
    """
    :param gzh: official-account name, or a keyword matching several accounts
    :param start_time: start of the time window, format 'YYYY-MM-DD'
    :param endtime: end of the time window, format 'YYYY-MM-DD'
    :return: None
    ps:
    The account-search URL is used to obtain the wxid; the query can be a
    keyword as well as an exact account name.
    When searching by keyword, this demo only takes the accounts on the first
    result page; paginate yourself if you need more.
    """
gzh_url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='.format(
quote(gzh))
gzh_html = get_html(gzh_url, 'https://weixin.sogou.com/')
wxid_list = gzh_html.css('ul.news-list2 li::attr(d)').extract()
for wxid in wxid_list:
page_ = True
page_count = 1
url = 'https://weixin.sogou.com/weixin?type=2&ie=utf8&query={}&tsn=5&ft={}&et={}&interation=&wxid={}&usip={}&page={}'.format(
quote(gzh), start_time, endtime, wxid, quote(gzh), page_count)
referer = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='.format(
quote(gzh))
while page_:
response = get_html(url, referer)
article_urls = response.css('div.news-box ul.news-list li div.txt-box h3 a::attr(data-share)').extract()
if len(article_urls) == 10:
                print('--paging: moving on to page {}--'.format(page_count + 1))
url = url.replace('&page={}'.format(page_count),'&page={}'.format(page_count+1))
page_count += 1
else:
page_ = False
for al in article_urls:
# print(al)
article_html = get_html_act(al, '')
article_name = article_html.css('#activity-name::text').extract_first()
if article_name:
                    # Print the title of the article behind the current link.
print(article_name.strip())
else:
print(al)
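
# The demo only prints the article title (point 5 of the module docstring).
# A minimal sketch of pulling a couple of extra fields out of the article
# Selector; the selectors (#js_name for the account name, #publish_time for
# the date) are assumptions about the 2019-era mp.weixin.qq.com page layout
# and may need adjusting:
def parse_article_extra(article_html):
    account = article_html.css('#js_name::text').extract_first()
    publish_time = article_html.css('#publish_time::text').extract_first()
    return {
        'account': account.strip() if account else None,
        'publish_time': publish_time.strip() if publish_time else None,
    }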
if __name__ == '__main__':
    # start of the time window
    start_time = '2019-03-01'
    # end of the time window
    endtime = '2019-04-01'
    # official-account name; a keyword that matches several accounts also works
    gzh = '痴海'
run(gzh, start_time, endtime)