'''
1. This script collects every Baidu snapshot ("百度快照") link for the configured keywords.
2. The snapshot pages are parsed later to extract page links, company names, "contact us" details, etc.
3. Two query forms give the best Baidu results:
@ '大连' and '技术支持:大连龙采':
    https://www.baidu.com/s?ie=utf-8&wd='大连' and '技术支持:大连龙采'&pn=180
    url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(area, company, page)
@ '技术支持:祥云科技' and '大连'
    https://www.baidu.com/s?ie=utf-8&wd='技术支持:祥云科技' and '大连'&pn=0
    url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(company, area, page)
This script is built around the keywords '大连' and '技术支持:大连龙采'
(a hedged URL-building sketch follows the keyword lists below).
'''
import random
from random import randint
import re
import time
from lxml import etree
import requests
from multiprocessing import Process
from redis import Redis
# Redis connection settings
# REDIS_HOST = '192.138.3.237'
REDIS_HOST = '47.135.133.33'
REDIS_PORT = 6379
# external (public) Redis settings
#REDIS_HOST = '222.232.90.33'
#REDIS_PORT = 6579
REDIS_DB = 12
REDIS_PASSWORD = '123456'
# REDIS_PASSWORD = 'spider'
redis_client = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                     password=REDIS_PASSWORD)
s = requests.session()
areas = ['大连']
# full list of support-company keywords
# companys = ['技术支持:大连龙采', '技术支持:云网(大连)','技术支持:合众商道(大连)',
#             '技术支持:大连祥云','技术支持:大连致远'
#             ]
companys = ['技术支持:云网(大连)信息技术有限公司']
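# Hedged sketch: the docstring shows the hand-built query format, and get_url()
# below hard-codes the percent-encoded URLs. The helper here only illustrates how
# the same "'area' and 'company'" query could be assembled from the `areas` /
# `companys` entries with urllib.parse.quote; the name build_search_url is
# illustrative and is not called anywhere in the original script.
from urllib.parse import quote

def build_search_url(area, company, page):
    """Return a Baidu search URL for the query "'area' and 'company'" at result offset `page`."""
    wd = quote("'{0}' and '{1}'".format(area, company))
    return "https://www.baidu.com/s?ie=utf-8&wd={0}&pn={1}".format(wd, page)

# Example (not executed by this script):
# build_search_url(areas[0], companys[0], 0)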
def get_random_proxy():
    """
    Pop one proxy address from the shared Redis proxy pool.
    :return: proxy string such as 'ip:port'
    """
    REDIS_HOST = '222.232.90.33'
    REDIS_PORT = 6579
    REDIS_DB = 7
    REDIS_PASSWORD = 'spider'
    redis_ = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                   password=REDIS_PASSWORD)
    while True:
        try:
            # lpop returns None when the pool is empty; .decode() then raises and we retry
            proxies_values = redis_.lpop('proxy5').decode()
            break
        except:
            time.sleep(1)
    print(proxies_values)
    return proxies_values
def get_url():
    # pool of User-Agent strings; one is picked at random for each request
    my_headers = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    ]
    value = get_random_proxy()
    # the proxy string may carry a prefix; keep only the 'ip:port' part
    proxies = {'http': 'http://' + re.findall(r'.*?(\d.*)', value)[0],
               'https': 'https://' + re.findall(r'.*?(\d.*)', value)[0]}
# for company in companys:
# company = company
# print('company')
# print(company)
# for area in areas:
# for page in range(1,70):
# page = page * 10
# url = "https://www.baidu.com/s?ie=utf-8&wd='{0}'+and+'{1}'&pn={2}".format(area,company,page)
# for page in range(2):
# page = page*10
# t=5
    for page in range(300, 760, 10):
        # Longcai Tech (龙采科技)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E9%BE%99%E9%87%87%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E9%BE%99%E9%87%87%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Zhiyuan Tech (致远科技)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E8%87%B4%E8%BF%9C%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E8%87%B4%E8%BF%9C%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Xiangyun Tech, Dalian (祥云科技大连)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E7%A5%A5%E4%BA%91%E7%A7%91%E6%8A%80%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E7%A5%A5%E4%BA%91%E7%A7%91%E6%8A%80%27&pn=' + str(
# page - 10)
        # Yunwang Dalian (云网大连): the query currently in use
url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E4%BA%91%E7%BD%91(%E5%A4%A7%E8%BF%9E)%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%27&pn='+str(page)
previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E4%BA%91%E7%BD%91(%E5%A4%A7%E8%BF%9E)%E4%BF%A1%E6%81%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%27&pn='+str(page-10)
        # Hezhong Shangdao (合众商道)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%90%88%E4%BC%97%E5%95%86%E9%81%93(%E5%A4%A7%E8%BF%9E)%27&pn=' + str(
# page)
# previous_url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27%20and%20%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%90%88%E4%BC%97%E5%95%86%E9%81%93(%E5%A4%A7%E8%BF%9E)%27&pn=' + str(
# page - 10)
# url = 'https://www.baidu.com/s?ie=utf-8&wd=%27%E5%A4%A7%E8%BF%9E%27+and+%27%E6%8A%80%E6%9C%AF%E6%94%AF%E6%8C%81:%E5%A4%A7%E8%BF%9E%E9%BE%99%E9%87%87%27&pn={}'.format(page)
        # spoof the Referer with the previous results page so paging looks organic
        headers = {
            'User-Agent': random.choice(my_headers), 'referer': previous_url
        }
        # if (t % 5 == 0):  # each proxy checks 3 pages
        #
        # t = t + 1
        e = 0          # set to 1 when a request fails and the proxy must be swapped
        retryurl = 1   # fetch attempts made for the current results page
        while True:
            if e == 1:
                value = get_random_proxy()
                proxies = {'http': 'http://' + re.findall(r'.*?(\d.*)', value)[0],
                           'https': 'https://' + re.findall(r'.*?(\d.*)', value)[0]}
                e = 0
            try:
                # throttle requests to reduce the chance of being blocked
                time.sleep(randint(20, 30))
                res = s.get(url, verify=False, allow_redirects=False, headers=headers,
                            timeout=40, proxies=proxies)
                retryurl = retryurl + 1
                # res = s.get(url, verify=False, allow_redirects=False, headers=headers, timeout=40)
                print('fetching:')
                print(url)
                print(proxies)
            except Exception as ex:
                print(ex)
                e = 1
                print('request failed, switching proxy')
                continue
            # print(res.text)
            html = etree.HTML(res.text)
            # collect every "百度快照" (Baidu snapshot) link on the results page
            urls = html.xpath('//a[contains(text(),"百度快照")]')
            # urls = html.xpath('//div[@class="result c-container "]/h3/a')  # //*[@id="1"]
            i = 1
            if len(urls) != 10:
                # a full results page carries 10 snapshots; retry up to 3 times otherwise
                if retryurl < 4:
                    continue
            # print and store every snapshot link found on this page
            for snapshot in urls:
                print(i)
                print(' ')
                i = i + 1
                kuaizhao = snapshot.xpath('@href')
                print(kuaizhao[0])
                redis_client.sadd('baidu_url_yunwang', kuaizhao[0])
            break
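# Hedged sketch: step 2 in the docstring (parsing the snapshot pages for company
# and contact information) is not implemented in this script. The generator below
# only shows how the URLs stored in the 'baidu_url_yunwang' set could be read back
# for that later stage; the name iter_snapshot_urls is illustrative.
def iter_snapshot_urls():
    """Yield every snapshot URL collected in the 'baidu_url_yunwang' Redis set."""
    for member in redis_client.smembers('baidu_url_yunwang'):
        yield member.decode()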
if __name__ == '__main__':
    # run the crawler in a child process and restart it if it ever dies
    while True:
        try:
            T = Process(target=get_url)
            T.start()
            T.join()
        except:
            time.sleep(20)
            print('restarting')