'''
1. This script collects every Baidu snapshot ("百度快照") link for the configured keywords.
2. The snapshot pages are parsed later to extract page links, company names, "contact us" details, etc.
3. Two query forms give the best Baidu results:
@ '大连' and '技术支持:大连龙采':
    https://www.baidu.com/s?ie=utf-8&wd='大连' and '技术支持:大连龙采'&pn=180
    url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(area, company, page)
@ '技术支持:祥云科技' and '大连'
    https://www.baidu.com/s?ie=utf-8&wd='技术支持:祥云科技' and '大连'&pn=0
    url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(company, area, page)
This script is built around the keywords '大连' and '技术支持:大连龙采'
(a hedged URL-building sketch follows the keyword lists below).
'''
import random
from random import randint
import re
import time
from lxml import etree
import requests
from multiprocessing import Process
from redis import Redis
# Redis connection settings
# REDIS_HOST = '192.138.3.237'
REDIS_HOST = '47.135.133.33'
REDIS_PORT = 6379
# external (public) Redis settings
#REDIS_HOST = '222.232.90.33'
#REDIS_PORT = 6579
REDIS_DB = 12
REDIS_PASSWORD = '123456'
# REDIS_PASSWORD = 'spider'
redis_client = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                     password=REDIS_PASSWORD)
s = requests.session()
areas = ['大连']
# full list of support-company keywords
# companys = ['技术支持:大连龙采', '技术支持:云网(大连)','技术支持:合众商道(大连)',
#             '技术支持:大连祥云','技术支持:大连致远'
#             ]
companys = ['技术支持:云网(大连)信息技术有限公司']
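# Hedged sketch: the docstring shows the hand-built query format, and get_url()
# below hard-codes the percent-encoded URLs. The helper here only illustrates how
# the same "'area' and 'company'" query could be assembled from the `areas` /
# `companys` entries with urllib.parse.quote; the name build_search_url is
# illustrative and is not called anywhere in the original script.
from urllib.parse import quote

def build_search_url(area, company, page):
    """Return a Baidu search URL for the query "'area' and 'company'" at result offset `page`."""
    wd = quote("'{0}' and '{1}'".format(area, company))
    return "https://www.baidu.com/s?ie=utf-8&wd={0}&pn={1}".format(wd, page)

# Example (not executed by this script):
# build_search_url(areas[0], companys[0], 0)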
def get_random_proxy():
    """
    Pop one proxy address from the shared Redis proxy pool.
    :return: proxy string such as 'ip:port'
    """
    REDIS_HOST = '222.232.90.33'
    REDIS_PORT = 6579
    REDIS_DB = 7
    REDIS_PASSWORD = 'spider'
    redis_ = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                   password=REDIS_PASSWORD)
    while True:
        try:
            # lpop returns None when the pool is empty; .decode() then raises and we retry
            proxies_values = redis_.lpop('proxy5').decode()
            break
        except:
            time.sleep(1)
    print(proxies_values)
    return proxies_values
def get_url():
    # pool of User-Agent strings; one is picked at random for each request
    my_headers = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    ]
    value = get_random_proxy()
    # the proxy string may carry a prefix; keep only the 'ip:port' part
    proxies = {'http': 'http://' + re.findall(r'.*?(\d.*)', value)[0],
               'https': 'https://' + re.findall(r'.*?(\d.*)', value)[0]}
# for company in companys:
# company = company
# print('company')
# print(company)
# for area in areas:
# for page in range(1,70):
# page = page * 10
# url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(area,company,page)
# for page in range(2):
# page = page*10
# t=5
    for page in range(300, 760, 10):
        # Longcai Tech (龙采科技)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E9%BE%99%E9%87%87%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E9%BE%99%E9%87%87%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Zhiyuan Tech (致远科技)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E8%87%B4%E8%BF%9C%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E8%87%B4%E8%BF%9C%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Xiangyun Tech, Dalian (祥云科技大连)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E7%A5%A5%E4%BA%91%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E7%A5%A5%E4%BA%91%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Yunwang Dalian (云网大连): the query currently in use
url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E4%BA%91%E7%BD%91(%E5%A4%A7%E8%BF%9E)%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%27&pn='+str(page)
previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E4%BA%91%E7%BD%91(%E5%A4%A7%E8%BF%9E)%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%27&pn='+str(page-10)
        # Hezhong Shangdao (合众商道)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%90%88%E4%BC%97%E5%95%86%E9%81%93(%E5%A4%A7%E8%BF%9E)%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%90%88%E4%BC%97%E5%95%86%E9%81%93(%E5%A4%A7%E8%BF%9E)%27&pn=' + str(
# page - 10)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27+and+%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%A4%A7%E8%BF%9E%E9%BE%99%E9%87%87%27&pn={}'.format(page)
        # spoof the Referer with the previous results page so paging looks organic
        headers = {
            'User-Agent': random.choice(my_headers), 'referer': previous_url
        }
        # if (t % 5 == 0):  # each proxy checks 3 pages
        #
        # t = t + 1
        e = 0          # set to 1 when a request fails and the proxy must be swapped
        retryurl = 1   # fetch attempts made for the current results page
        while True:
            if e == 1:
                value = get_random_proxy()
                proxies = {'http': 'http://' + re.findall(r'.*?(\d.*)', value)[0],
                           'https': 'https://' + re.findall(r'.*?(\d.*)', value)[0]}
                e = 0
            try:
                # throttle requests to reduce the chance of being blocked
                time.sleep(randint(20, 30))
                res = s.get(url, verify=False, allow_redirects=False, headers=headers,
                            timeout=40, proxies=proxies)
                retryurl = retryurl + 1
                # res = s.get(url, verify=False, allow_redirects=False, headers=headers, timeout=40)
                print('fetching:')
                print(url)
                print(proxies)
            except Exception as ex:
                print(ex)
                e = 1
                print('request failed, switching proxy')
                continue
            # print(res.text)
            html = etree.HTML(res.text)
            # collect every "百度快照" (Baidu snapshot) link on the results page
            urls = html.xpath('//a[contains(text(),"百度快照")]')
            # urls = html.xpath('//div[@class="result c-container "]/h3/a')  # //*[@id="1"]
            i = 1
            if len(urls) != 10:
                # a full results page carries 10 snapshots; retry up to 3 times otherwise
                if retryurl < 4:
                    continue
            # print and store every snapshot link found on this page
            for snapshot in urls:
                print(i)
                print(' ')
                i = i + 1
                kuaizhao = snapshot.xpath('@href')
                print(kuaizhao[0])
                redis_client.sadd('baidu_url_yunwang', kuaizhao[0])
            break
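# Hedged sketch: step 2 in the docstring (parsing the snapshot pages for company
# and contact information) is not implemented in this script. The generator below
# only shows how the URLs stored in the 'baidu_url_yunwang' set could be read back
# for that later stage; the name iter_snapshot_urls is illustrative.
def iter_snapshot_urls():
    """Yield every snapshot URL collected in the 'baidu_url_yunwang' Redis set."""
    for member in redis_client.smembers('baidu_url_yunwang'):
        yield member.decode()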
if __name__ == '__main__':
    # run the crawler in a child process and restart it if it ever dies
    while True:
        try:
            T = Process(target=get_url)
            T.start()
            T.join()
        except:
            time.sleep(20)
            print('restarting')