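# Crawler that queries Sogou Wenwen, orders the answers by relevance, combines them into an article and saves it locally; the result is highly readable.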
# encoding='utf-8'
# coding: utf-8
# Author: 小章哥儿
# Date: 2021-08-03
from lxml import etree
import re
import requests
import time
class Sogou():
def __init__(self):
return
def get_html(self, keyword):
    """Fetch the Sogou search result page for *keyword*.

    The request is sent to ``https://www.sogou.com/sogou`` with
    ``ie=utf8`` and ``insite=wenwen.sogou.com`` alongside the query.

    Args:
        keyword: Search phrase to look up.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not complete within the timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
    # Hand the query to requests as a mapping instead of splicing it into
    # the URL: characters such as "&", "#", "+" or "=" inside the keyword
    # are then percent-encoded rather than corrupting the query string.
    params = {
        "query": keyword,
        "ie": "utf8",
        "insite": "wenwen.sogou.com",
    }
    # Without a timeout requests may block indefinitely on a stalled
    # connection; 10 seconds is a generous upper bound for one page.
    response = requests.get(
        "https://www.sogou.com/sogou",
        params=params,
        headers=headers,
        timeout=10,
    )
    return response.text
def collect_urls(self, keyword):
"""
采集问答关键词前三个标题和链接,列表 元祖形式
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
html = self.get_html(keyword)
selector = etree.HTML(html)
questions = [i.xpath('string(.)').replace('搜狗问问', '').replace('搜狗', '').replace('-', '') for i in
selector.xpath(