Crawler Practice
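A crawler exercise: the script below drives Chrome through Selenium, walks 58.com's city-switch page to collect the second-hand housing (ershoufang) channel URL of every city, then crawls each city's listing pages with a small thread pool and appends one record per property to a per-city text file under savePath. It is written for Python 2 (note the print statements and the sys.setdefaultencoding hack).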

# -*- coding: utf-8 -*-
import sys
import time
import threadpool
from selenium import webdriver

# Python 2 hack: force the default string encoding to UTF-8 so that
# Chinese city names can be written to files without UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

webdriverPath = 'D:\\test\\chromedriver.exe'  # path to chromedriver.exe
savePath = 'D:\\test'  # directory for the per-city result files


class GetName(object):
    def getName(self):
        # Reuse the local Chrome profile so any existing login state is kept.
        options = webdriver.ChromeOptions()
        options.add_argument("--user-data-dir=" + r"C:/Users/cui/AppData/Local/Google/Chrome/User Data/")
        driver = webdriver.Chrome(webdriverPath, chrome_options=options)
        try:
            allCityUrlList = []
            # The city-switch page lists every city that has an ershoufang channel.
            driver.get('http://www.58.com/ershoufang/changecity/')
            time.sleep(2)
            cityPage = driver.find_element_by_css_selector('#clist')
            cityList = cityPage.find_elements_by_tag_name('dd')
            print len(cityList)
            for city in cityList:
                cityUrlList = city.find_elements_by_tag_name('a')
                for cityUrl in cityUrlList:
                    url = cityUrl.get_attribute('href')
                    cityName = cityUrl.text
                    cityItem = {"cityName": cityName, "cityUrl": url}
                    print cityItem
                    allCityUrlList.append(cityItem)
            print len(allCityUrlList)
            # Serial alternative, useful for debugging:
            # for allCityUrl in allCityUrlList:
            #     self.crawler(allCityUrl)
            # Crawl three cities concurrently; makeRequests hands each dict in
            # allCityUrlList to self.crawler as its single positional argument.
            pool = threadpool.ThreadPool(3)
            requests = threadpool.makeRequests(self.crawler, allCityUrlList)
            for req in requests:
                pool.putRequest(req)
            pool.wait()
        except Exception:
            print "crawler name error !"
        finally:
            driver.close()

    def getUrlPage(self, driver, allCityUrl):
        # Each property on a listing page is one <li> inside this <ul>.
        page = driver.find_element_by_css_selector(
            'body > div.main-wrap > div.content-wrap > div.content-side-left > ul')
        infoList = page.find_elements_by_css_selector('li')

        resultList = []
        for info in infoList:
            addressNameInfo = info.find_element_by_class_name('list-info').text
            priceInfo = info.find_element_by_class_name('price').text
            addressNameTemp = addressNameInfo.split("\n")
            priceTemp = priceInfo.split("\n")
            # The field positions below depend on 58.com's current page layout.
            addressTemp = addressNameTemp[2].split(" ")
            name = addressNameTemp[3].split(" ")[1]
            result = name + ' ' + allCityUrl.get('cityName') + ' ' + addressTemp[1] + ' ' + addressTemp[0] + ' ' + \
                     addressTemp[2] + ' ' + addressNameTemp[1] + ' ' + priceTemp[0] + ' ' + priceTemp[1]
            resultList.append(result)
            print result
            self.writeFile(result, allCityUrl.get('cityName'))
        return resultList

    def writeFile(self, content, cityName):
        # Append one record per line to <savePath>\<cityName>.txt.
        with open(savePath + '\\' + cityName + '.txt', 'a+') as file_object:
            file_object.write(content)
            file_object.write('\n')

    def crawler(self, allCityUrl):
        # Each worker thread drives its own Chrome instance.
        options = webdriver.ChromeOptions()
        options.add_argument("--user-data-dir=" + r"C:/Users/cui/AppData/Local/Google/Chrome/User Data/")
        driver = webdriver.Chrome(webdriverPath, chrome_options=options)
        try:
            driver.get(allCityUrl.get("cityUrl"))
            url = ""
            nextPage = "1"
            try:
                # The fourth pager link carries the last page number as its text
                # and a URL containing the "/pn<N>/" page marker as its href.
                pager = driver.find_element_by_css_selector(
                    'body > div.main-wrap > div.content-wrap > div.content-side-left > div.pager > a:nth-child(4)')
                nextPage = pager.text
                url = pager.get_attribute('href')
            except Exception:
                print "get pageNum error ! "
            print url
            for i in range(1, int(nextPage)):
                try:
                    # Rewrite the "/pn<N>/" marker to visit page i.
                    url = url.split("/pn")[0]
                    url = url + "/pn" + str(i) + "/"
                    print url
                    driver.get(url)
                    time.sleep(1)
                    urlList = self.getUrlPage(driver, allCityUrl)
                    print len(urlList)
                except Exception:
                    print "get pageInfo error ! "
        except Exception:
            print "one pool error !"
        finally:
            driver.close()

if __name__ == "__main__":
    test = GetName()
    test.getName()
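To run the script you need Python 2, the selenium and threadpool packages, and a chromedriver.exe matching your Chrome version at webdriverPath. The thread-pool dispatch pattern used in getName can also be tried in isolation; the following is a minimal sketch, assuming the threadpool package is installed and using made-up city entries purely for illustration:

# -*- coding: utf-8 -*-
import threadpool

def work(item):
    # Stand-in for GetName.crawler: receives one dict per work request.
    print item.get("cityName")

# Hypothetical entries in the same shape as allCityUrlList.
items = [{"cityName": "beijing", "cityUrl": "http://bj.58.com/ershoufang/"},
         {"cityName": "shanghai", "cityUrl": "http://sh.58.com/ershoufang/"}]

pool = threadpool.ThreadPool(3)
for req in threadpool.makeRequests(work, items):
    pool.putRequest(req)
pool.wait()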