基于百度、360 首页的广告推广 URL 收集。
转载请标明出处。
# -*- coding: gbk -*-
import csv
import os
import re
import time
from urllib import parse
import requests
import urllib3
# Suppress urllib3's InsecureRequestWarning, which fires on every request
# because all fetches below use verify=False through the tunnel proxy.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class Bdcrjy:
    """Collect sponsored-ad URLs from Baidu and 360 (so.com) search result
    pages — PC and WAP front ends — and append them to per-keyword CSV files.

    All requests go through a Kuaidaili tunnel proxy whose credentials are
    placeholders; fill them in before running.
    """

    def __init__(self):
        # PC - 360 search
        self.tszoneUrl = 'https://www.so.com/s?ie=utf-8&fr=none&src=360sou_newhome&q={}'
        self.tszheaders = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
        }
        # WAP - 360 search
        self.tsztwoUrl = 'https://m.so.com/index.php?ie=utf-8&fr=none&src=360sou_newhome&q={}'
        self.tsztwoHeaders = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Mobile Safari/537.36',
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        }
        # PC - Baidu
        self.baiduoneUrl = 'https://www.baidu.com/s?wd={}'
        self.baiduheaders = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            'Accept-Encoding': "gzip, deflate, br",
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        }
        # WAP - Baidu
        self.baidutwoUrl = 'https://m.baidu.com/ssid=25d43239c4c7ccec2253/s?word={}'
        self.baidutwoHeaders = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            'Accept-Encoding': "gzip, deflate",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Referer': "http://m.baidu.com/?tn=&from=",
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        }
        # Kuaidaili tunnel proxy -------------------------------------------
        # Rotates through nationwide IPs. All four values are placeholders.
        tunnel_host = "IP"
        tunnel_port = "端口"
        # Tunnel username / password.
        tid = "账号"
        password = "密码"
        # NOTE(review): requests proxy URLs normally use the "http://" scheme
        # even for the "https" entry with tunnel providers — confirm the
        # required scheme with your provider before changing it.
        self.proxies = {
            "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
            "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
        }
        # ------------------------------------------------------------------

    def get_html(self, url, headers):
        """Fetch *url* through the tunnel proxy and return the decoded body.

        :param url: absolute URL to fetch
        :param headers: request headers matching the target platform
        :return: response text on HTTP 200, otherwise "" so callers can
                 safely regex-search the result without a None check
        """
        resp = requests.get(url=url, headers=headers, verify=False, proxies=self.proxies, stream=True)
        if resp.status_code == 200:
            # Bug fix: the original assigned `resp.enconding` (typo), which
            # just created a junk attribute — the detected encoding was never
            # actually applied to the response.
            resp.encoding = resp.apparent_encoding
            return resp.text
        # Bug fix: previously returned None implicitly, which crashed the
        # caller's string handling on any non-200 response.
        return ""

    @staticmethod
    def _skip_url(item_url):
        """Return True for URLs that should NOT be written to the CSV.

        Placeholder filter — add your own rejection conditions here
        (the original left this as "自行写判断条件").
        """
        # TODO: add user-specific filtering; everything is kept by default.
        return False

    def run_func(self, url, filename=None, search=None, headers=None, re_1=None, platform=None):
        """Scrape one result page and append extracted ad URLs to a CSV file.

        The output file is "<filename><search>-<keyword>.csv"; the directory
        is created if missing.

        :param url: search-result URL to scrape
        :param filename: output directory path (with trailing separator)
        :param search: search-engine label used in logs and the CSV name
        :param headers: request headers matching the target platform
        :param re_1: regex that extracts ad URLs from the page source
        :param platform: platform label ("wap" or "pc")
        :return: None (writes a CSV file as a side effect)
        """
        # The search keyword is the last query-string value of the URL.
        pc_word = parse.unquote(url.split("=")[-1])
        html = self.get_html(url=url, headers=headers)
        # Deduplicate the regex matches before writing.
        xpath_list = list(set(re.findall(re_1, html, re.S)))
        if not xpath_list:
            return
        print("当前平台:" + search + "-" + platform, "--当前搜索关键词:", pc_word, "去重后的链接数量:" + str(len(xpath_list)))
        if not os.path.exists(filename):
            os.makedirs(filename)
        with open(filename + search + "-" + pc_word + ".csv", 'a+', encoding='utf-8-sig', newline='') as myFile:
            csv_writer = csv.writer(myFile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for item in xpath_list:
                item_url = parse.unquote(item)
                # Bug fix: the original wrote `if item_url or "自行写判断条件":
                # continue`, which is always true (non-empty string literal),
                # so no row was ever written. Now every URL is written unless
                # the user-supplied filter rejects it.
                if self._skip_url(item_url):
                    continue
                csv_writer.writerow([item_url])
                print("未处理的写入完成", "未处理的URL:", item_url)

    def run_360_baidu(self, url):
        """Dispatch *url* to run_func with the headers for its platform.

        Output directory, search label, regex and platform tag remain
        user-supplied placeholders, as in the original.

        :param url: a 360 or Baidu search-result URL (PC or WAP)
        """
        # Bug fix: the original passed the literal string "Request Headers"
        # to requests for every branch instead of the per-platform header
        # dicts built in __init__, and wrapped the dispatch in a pointless
        # `for i in range(1)` loop solely so `continue` was legal in except.
        dispatch = (
            ('https://www.so.com', self.tszheaders),
            ('https://m.so.com', self.tsztwoHeaders),
            ('https://www.baidu.com', self.baiduheaders),
            ('https://m.baidu.com', self.baidutwoHeaders),
        )
        for prefix, headers in dispatch:
            if url.startswith(prefix):
                try:
                    # TODO: fill in filename / search / re_1 / platform for
                    # each target, as the original placeholders instructed.
                    self.run_func(url=url, filename='路径' + "/", search="搜索器",
                                  headers=headers, re_1="正则表达式-自行尝试",
                                  platform="确定平台")
                except Exception as e:
                    # Best-effort: log and move on, matching original intent.
                    print(e)
                break
if __name__ == '__main__':
    spider = Bdcrjy()
    start_time = time.time()
    # TODO: replace the placeholder with your real search keywords.
    list_guanjianci = ["相关词条"]
    # Build the four platform URLs (360 PC/WAP, Baidu PC/WAP) per keyword.
    list_Url = []
    for item in list_guanjianci:
        quoted = parse.quote(item)
        list_Url.append(spider.tszoneUrl.format(quoted))
        list_Url.append(spider.tsztwoUrl.format(quoted))
        list_Url.append(spider.baiduoneUrl.format(quoted))
        list_Url.append(spider.baidutwoUrl.format(quoted))
    # Bug fix: the original commented out every execution strategy, so the
    # script built URLs and then exited without scraping anything. Run
    # sequentially by default; swap in the pooled variant below for speed.
    for item in list_Url:
        spider.run_360_baidu(item)
    # Thread-pool alternative (I/O-bound work, so threads overlap well):
    # from concurrent.futures import ThreadPoolExecutor
    # with ThreadPoolExecutor(max_workers=32) as executor:
    #     executor.map(spider.run_360_baidu, list_Url, timeout=None)
    # Process-pool alternative:
    # from multiprocessing import Pool
    # with Pool(5) as mp:
    #     mp.map(spider.run_360_baidu, list_Url)
    end_time = time.time()
    print('花费了%d秒' % (end_time - start_time))
转载请标明出处。
禁止商用。复制本代码后,修改、运行所造成的一切负面影响与本人无关。
欢迎转载。