get_all_url

# -*- coding: utf-8 -*-
"""
 @Time     : 2020/10/29 13:47
 @Author   : LinXiao
 @Function :
"""
# ------------------------------
# Get the total number of result pages for each city
import random
import time
from pprint import pprint

import requests
from loguru import logger
from lxml import etree
from redis import Redis

from spider.alifapai_pc import broswer_head_city_frist, pagination
from tools.city_name import city_to_gb2312, hanzi_to_pinyin

# sys.stdout=io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

redis_url=Redis(db=10)  # Redis queue that stores the generated listing urls

CITY_NAMES=['成都', '西安', '重庆', '武汉', '青岛', '广州', '长沙', '兰州']


# Get the total page count
def get_page_total(first_page):
    time.sleep(random.random())
    ip, headers, payload, files, proxies=broswer_head_city_frist()
    # random sleep to throttle requests and reduce the chance of being blocked
    sleeptime=random.randint(15, 35)
    time.sleep(sleeptime)
    print(f'Random sleep for {sleeptime}s')
    print('Requesting page.......')
    try:
        start=time.time()
        response=requests.request("GET", first_page, headers=headers, data=payload, files=files, proxies=proxies,
                                  timeout=40)
        pprint(response.text)
        end=time.time()
        print(f'Page request success! Took {end - start}s')
    except Exception as e:
        logger.error(f'Request failed!....{e}, removing unusable ip!')
        redis_ip=Redis(db=8)
        redis_ip.lrem("proxy_ip", 0, ip)  # remove all occurrences of this ip from the proxy list
        return None
    try:
        # re-decode: the page is GBK-encoded, but requests may guess the wrong charset
        html_content=response.text.encode(response.encoding).decode("gbk")
        # parse with xpath: the total item count sits in the #J_LimitFixed sidebar
        tree_html=etree.HTML(html_content)
        page_total_str=(tree_html.xpath('//*[@id="J_LimitFixed"]/ul/li[1]/em/text()'))[0]
        items_count=int(page_total_str)
        # 40 items per result page
        if items_count <= 40:
            page_total=1
        else:
            page_total=items_count // 40 + 1
        print(f'page_total is: {page_total}')
        return page_total
    except Exception as e:
        print(f'Parse error! {e}')
        return None
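
# A note on the paging math above: with 40 items per page, items_count // 40 + 1
# over-counts by one whenever the count is an exact multiple of 40 (80 items is
# 2 pages, but the formula gives 3). If exactness matters, a ceiling division
# avoids the edge case. A minimal sketch; pages_for is a hypothetical helper,
# not part of the original script:
#
# import math
#
# def pages_for(items_count, page_size=40):
#     # ceiling division: exact multiples no longer add a trailing empty page
#     return max(1, math.ceil(items_count / page_size))
#
# assert pages_for(40) == 1   # exactly one full page
# assert pages_for(41) == 2
# assert pages_for(80) == 2   # the // 40 + 1 formula would give 3 here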





# Build the urls for all pages
def get_all_page(page_total, cityname):
    for page_num in range(1, int(page_total) + 1):  # page 32 corresponds to 2020-08-29 (only data from before September is needed)
        parm=pagination()  # e.g. spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0
        # example final url: https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.7.14f14cc6QQLvCs&category=50025969&auction_source=0&city=%B3%C9%B6%BC&st_param=-1&auction_start_seg=-1&page=9
        pre_url="https://sf.taobao.com/item_list.htm?"
        city_pinyin=hanzi_to_pinyin(cityname)
        city_code=city_to_gb2312(cityname)
        suffix=f"&category=50025969&auction_source=0&city={city_code}&st_param=-1&sorder=0&auction_start_seg=-1&page={page_num}"

        url=pre_url + parm + suffix
        print(url)
        redis_url.lpush(str(city_pinyin), url)
    logger.info(f"All urls for {cityname} have been fetched and saved")
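
# get_all_page pushes each url onto a Redis list keyed by the city's pinyin, so a
# downstream worker can drain that queue with rpop (lpush + rpop gives FIFO order).
# A minimal consumer sketch, assuming the same db=10 connection defined above;
# consume_city_urls is a hypothetical helper, not part of the original script:
def consume_city_urls(city_pinyin):
    # hypothetical worker: pop urls for one city until the queue is empty
    while True:
        url=redis_url.rpop(city_pinyin)
        if url is None:
            break
        yield url.decode()  # redis-py returns bytes unless decode_responses=True is set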


if __name__ == '__main__':
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.6e464cc6ZhiDi4&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=4&st_param=-1&auction_start_seg=-1&page=2"
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.105.501c4cc6MHjcIg&category=50025969&auction_source=0&item_biz_type=6&city=%B3%C9%B6%BC&sorder=1&st_param=-1&auction_start_seg=-1"
    # first_page = "https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.104.e3954cc6Ph5abU&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=0&st_param=-1&auction_start_seg=-1"
    # get_page_total(first_page)


    pre_url="https://sf.taobao.com/item_list.htm?"
    for cityname in CITY_NAMES:
        print(cityname)

        city_code=city_to_gb2312(cityname)
        parm=pagination()  # spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0

        suffix=f'&category=50025969&auction_source=0&city={city_code}&sorder=0&st_param=-1&auction_start_seg=-1&page=1'
        url=pre_url + parm + suffix
        print(url)
        page_total=get_page_total(url)  # returns the total page count of ongoing auctions for this city
        print(f'{cityname} has {page_total} pages of ongoing auction listings')

        # generate the urls of all ongoing items for each city
        # get_all_page(page_total, cityname)