import re
import ssl
import requests
import urllib3
from w3lib.html import remove_tags
# Browser-like User-Agent (Chrome 91 on 64-bit Windows 10) sent with requests
# so they identify as a regular desktop browser.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
)
headers = {"User-Agent": _USER_AGENT}

# NOTE(review): this swaps the stdlib's default HTTPS context factory for the
# unverified one, so code relying on that default no longer validates server
# certificates. It is a process-wide side effect -- confirm it is intentional.
ssl._create_default_https_context = ssl._create_unverified_context

# Silence urllib3's warnings (e.g. those raised for unverified HTTPS requests).
urllib3.disable_warnings()
# 特殊字符替换
def replace_entity(str):
CHAR_ENTITIES_dict = {
" ": "",
" ": "",
" ": "",
"<": "<",
">": ">",
"&": "&",
""": '"',
"“": "“",
"”": "”",
"©": "©",
"®": "™",
"×": "×",
"÷": "÷",
}
for i in list(CHAR_ENTITIES_dict.keys
# 爬虫:只保留<span><p><div>标签,去除各种样式,将div、span标签处理为p标签
最新推荐文章于 2023-03-05 18:49:21 发布