A Car-Website Forum Crawler

The main techniques used are an IP proxy pool, browser automation with Selenium, time-window filtering, and a database connection (HBase).
The full code:
```python
# -*- coding: utf-8 -*-
# Filename: Autohome forum (汽车之家论坛)
# Author:   Guan
# Datetime: 2018/12/27

from selenium import webdriver
import re
from hashlib import md5
import happybase
import datetime
import time
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
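
# Note: the find_element_by_xpath(...) calls and the desired_capabilities
# keyword used below assume Selenium 3.x; both were removed in Selenium 4
# (a Selenium 4 sketch follows the code listing).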

HBASE_HOST = 'your_host_address'  # placeholder: your HBase Thrift host
HBASE_PORT = 9090                 # placeholder: your HBase Thrift port (9090 is the default)
HBASE_TABLE = 'your_table'        # placeholder: your HBase table name

host = HBASE_HOST
port = HBASE_PORT
table_name = HBASE_TABLE

# autoconnect=False: the Thrift connection is opened on demand in write().
connection = happybase.Connection(host=host, port=port, timeout=None, autoconnect=False)

def get_ip():
    # Build the proxy pool from a local text file.
    proxie = []
    ipfile = open('ipforautohome.txt', encoding='utf-8')
    # Read the proxies in from the txt file.
    ipagent = ipfile.readlines()
    for ip in ipagent:
        print(ip)
        proxie.append(ip.strip())
    print(proxie)
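
    # Assumed format of ipforautohome.txt: one proxy per line as
    # "host:port", e.g.
    #   1.2.3.4:8080
    #   5.6.7.8:3128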

    # Keep trying random proxies until a Chrome instance starts successfully.
    while True:

        # Pick a random proxy from the pool.
        ip = random.choice(proxie)
        print("Proxy IP:", ip)

        # Proxy type: manually configured.
        print("Proxy type:", ProxyType.MANUAL)

        try:

            # Build the proxy object.
            proxy = Proxy({
                # proxy mode
                'proxyType': ProxyType.MANUAL,
                # the HTTP proxy (host:port)
                'httpProxy': ip
            })
            # Copy Chrome's capabilities and inject the proxy into them.
            desired_capabilities = DesiredCapabilities.CHROME.copy()
            proxy.add_to_capabilities(desired_capabilities)

            # Start Chrome with the proxied capabilities.
            driver1 = webdriver.Chrome(
                desired_capabilities=desired_capabilities
            )

            return driver1

        except Exception:
            # This proxy failed; loop and try another.
            print("Proxy IP request failed: " + ip)

def get_cont(url, chexing):
    # Uses the module-level driver created under __main__.
    driver.get(url)

    autohome_luntan_list = []

    # Each forum page lists at most 107 <dl> thread entries.
    for index in range(1, 108):

        autohome_luntan_dict = {}
        try:
            # car model id
            autohome_luntan_dict['chexing_id'] = chexing
            # thread title
            autohome_luntan_dict['title'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dt/a[1]' % index).text.strip()

            # thread URL
            autohome_luntan_dict['title_url'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dt/a' % index).get_attribute('href')

            # thread ids parsed out of the URL path
            titleUrl_result = re.split('/', driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dt/a' % index).get_attribute('href'))
            autohome_luntan_dict['title_urlid1'] = titleUrl_result[5]
            autohome_luntan_dict['title_urlid2'] = re.sub(r'\.html', '', titleUrl_result[6])

            # author
            autohome_luntan_dict['author'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[1]/a' % index).text.strip()

            # author URL
            autohome_luntan_dict['author_url'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[1]/a' % index).get_attribute('href')

            # author id parsed out of the URL path
            authorId_result = re.split('/', driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[1]/a' % index).get_attribute('href'))
            autohome_luntan_dict['author_id'] = authorId_result[3]

            # publish date
            autohome_luntan_dict['pub_date'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[1]/span' % index).text.strip()

            # reply count
            autohome_luntan_dict['hf_count'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[2]/span[1]' % index).text.strip()

            # click count
            autohome_luntan_dict['dj_count'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[2]/span[2]' % index).text.strip()

            # last replier
            autohome_luntan_dict['last_people'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[3]/a' % index).text
            # last replier's URL
            autohome_luntan_dict['last_peopleUrl'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[3]/a' % index).get_attribute('href')

            # last replier's id parsed out of the URL path
            peopleId_result = re.split('/', driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[3]/a' % index).get_attribute('href'))
            autohome_luntan_dict['lastPeople_id'] = peopleId_result[3]

            # last reply time
            autohome_luntan_dict['last_date'] = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[3]/span' % index).text.strip()

            # Time-window filter: keep threads from the past 60 days, skip
            # today's threads, and once a thread older than 60 days shows up,
            # return its publish date so the caller can stop paging.
            pub_date = driver.find_element_by_xpath(
                '//*[@id="subcontent"]/dl[%d]/dd[1]/span' % index).text.strip()
            before = format(datetime.datetime.now() + datetime.timedelta(days=-60), '%Y-%m-%d')
            if pub_date == datetime.datetime.now().strftime('%Y-%m-%d'):
                continue
            elif pub_date >= before:
                # '%Y-%m-%d' strings sort chronologically, so plain string
                # comparison is enough here.
                autohome_luntan_list.append(autohome_luntan_dict)
            else:
                return pub_date
        except Exception:
            continue

    # Print and persist everything collected from this page.
    for cont in autohome_luntan_list:
        print(cont)
        write(cont)

# Write one record to HBase.
def write(cont):
    # Row key: date + title + URL, hashed with MD5 so the key has a fixed
    # length and the same thread is written at most once per day.
    rowkey = datetime.datetime.now().strftime('%Y%m%d') + cont['title'] + cont['title_url']
    connection.open()
    table = connection.table(table_name)
    table.put(md5(rowkey.encode('utf-8')).hexdigest(), {
        "cf1:chexing_id": cont['chexing_id'],
        "cf1:title": cont['title'],
        "cf1:title_url": cont['title_url'],
        "cf1:title_urlid1": cont['title_urlid1'],
        "cf1:title_urlid2": cont['title_urlid2'],
        "cf1:author": cont['author'],
        "cf1:author_url": cont['author_url'],
        "cf1:author_id": cont['author_id'],
        "cf1:pub_date": cont['pub_date'],
        "cf1:hf_count": cont['hf_count'],
        "cf1:dj_count": cont['dj_count'],
        "cf1:last_people": cont['last_people'],
        "cf1:last_peopleUrl": cont['last_peopleUrl'],
        "cf1:lastPeople_id": cont['lastPeople_id'],
        "cf1:last_date": cont['last_date'],
    })
    connection.close()
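

# Portability note (an addition, not part of the original post): happybase
# documents row keys and values as byte strings, and whether plain str is
# accepted depends on the underlying Thrift library. If puts fail with type
# errors, an explicit-bytes variant like this hypothetical write_bytes()
# avoids the issue:
def write_bytes(cont):
    # Same row-key scheme as write(): date + title + URL, hashed with MD5.
    rowkey = datetime.datetime.now().strftime('%Y%m%d') + cont['title'] + cont['title_url']
    row = md5(rowkey.encode('utf-8')).hexdigest().encode('utf-8')
    # Encode all column names and values to UTF-8 bytes.
    data = {('cf1:' + k).encode('utf-8'): str(v).encode('utf-8')
            for k, v in cont.items()}
    connection.open()
    connection.table(table_name).put(row, data)
    connection.close()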

if __name__ == '__main__':

    # Create a driver that routes through the proxy pool.
    driver = get_ip()

    # Local IP (no proxy):
    # driver = webdriver.Chrome()

    # Read the car-model ids for the forum URLs.
    fp_path = 'chexing.txt'  # placeholder: path to the model-id file
    file = open(fp_path, 'r', encoding='utf-8')
    wj_cont = file.readlines()
    cx = []
    for c in wj_cont:
        # Each line holds comma-separated model ids.
        new_chexing = c.split(',')
        for nc in new_chexing:
            cx.append(nc.strip())
    # Crawl each car model.
    for chexing in cx:

        try:
            index = 1
            while True:
                time.sleep(1.5)

                url = "https://club.*******.com.cn/bbs/forum-c-%s-%d.html?orderby=dateline&qaType=-1" % (
                    chexing, index)

                print("Fetching model %s, page %d" % (chexing, index))

                pub_date = get_cont(url, chexing)
                # get_cont returns a publish date only when it reaches a
                # thread older than the 60-day window; a date at or before
                # the 61-days-back cutoff ends the loop for this model.
                break_time = format(datetime.datetime.now() + datetime.timedelta(days=-61), '%Y-%m-%d')
                if pub_date is not None and pub_date <= break_time:
                    break
                # next page
                index += 1

        except Exception:
            # Note: this break stops the whole run, not just this model.
            print("Finished crawling the current model")
            break

    print("Closing the driver")
    driver.quit()

```
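
Note: the code above targets Selenium 3.x. In Selenium 4, `DesiredCapabilities`, the `desired_capabilities` keyword, and the `find_element_by_*` helpers are all gone. A minimal sketch of the same proxy setup on Selenium 4 (assuming the same `host:port` proxy strings; `get_driver` is a hypothetical helper, not part of the original post):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

def get_driver(ip):
    # Route Chrome's traffic through the proxy with a command-line switch.
    options = webdriver.ChromeOptions()
    options.add_argument('--proxy-server=http://%s' % ip)
    return webdriver.Chrome(options=options)

# Element lookups change accordingly, e.g.:
# driver.find_element(By.XPATH, '//*[@id="subcontent"]/dl[1]/dt/a[1]')
```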
