爬取完成后会自动生成 CSV 电子表格文件(文件名为 `<城市简称>_houses.csv`),含有标题、房价、押付、链接等信息
环境
Python 2.7(脚本使用了 urllib2、raw_input 和 reload(sys),不兼容 Python 3)
pip install lxml
pip install cssselect
# coding: utf-8
"""Scrape rental listings from 58.com for one city into a CSV file.

Target interpreter: Python 2.7 (relies on ``urllib2``, ``raw_input`` and
``reload(sys)``).  Third-party requirements: ``lxml`` and ``cssselect``.

Flow: prompt for a city abbreviation, download that city's rental index
page, collect the listing links, then fetch every listing in its own thread
and append its title, price, deposit terms and detail cells to
``<CITY>_houses.csv`` in the current directory.
"""
import csv
import urllib2
import lxml.html
import time
import sys
from lxml.cssselect import CSSSelector
import threading

# Python 2 only: make UTF-8 the implicit str<->unicode codec so that the
# byte-string labels and the unicode text scraped from the page can be mixed
# in print statements and in csv.writer rows.
reload(sys)
sys.setdefaultencoding('utf8')

print("请输入要爬取得城市简称例如bj(北京):")
CITY = str(raw_input(">>>"))

# CSS paths of the fields scraped from a listing page.
_TITLE_CSS = 'div.main-wrap > div.house-title > h1'
_PRICE_CSS = 'div.house-pay-way > span:nth-child(1)'
_DEPOSIT_CSS = 'div.house-pay-way > span:nth-child(2)'
# Detail cells are addressed as <li> number (1..7) x <span> number (1..2).
_DETAIL_CSS = ('div.house-desc-item > ul.f14 > li:nth-child(%s)'
               ' > span:nth-child(%s)')
_DETAIL_ITEMS = 7
_DETAIL_SPANS = 2
# CSS path of the listing links on the index page.
_LISTING_LINK_CSS = ('div.mainbox > div.main > div.content > div.listBox'
                     ' > ul.listUl > li > div.des > h2 > a')

# All worker threads append to the same CSV file; the lock keeps the rows of
# one listing contiguous instead of interleaved with another thread's rows.
_CSV_LOCK = threading.Lock()


def download(url, user_agent='Google', num_retries=2):
    """Fetch *url* and return the response body, or None on failure.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts to make when the server answers
            with a 5xx status.  Other errors are not retried.

    Returns:
        The raw response body as returned by ``read()``, or ``None`` when
        the request failed and no retries are left (or the error is not
        retryable).
    """
    request = urllib2.Request(url, headers={'User-agent': user_agent})
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        # Only HTTPError carries a status code; plain URLError does not.
        code = getattr(e, 'code', None)
        if num_retries > 0 and code is not None and 500 <= code < 600:
            # Pass by keyword: the second positional parameter is
            # user_agent, so a positional retry count would corrupt the
            # header and never decrease the counter.
            return download(url, user_agent=user_agent,
                            num_retries=num_retries - 1)
        return None


def get_data(url):
    """Download one listing page and append its data to the city CSV file.

    Prints the title and ``price|deposit`` line, then appends one title row
    (which also carries *url*), one price row, one deposit row and one row
    per detail cell found on the page.  Listings that cannot be downloaded
    or that lack the title/price/deposit elements are skipped with a note
    on stderr.

    Args:
        url: Address of the listing detail page.
    """
    html_text_detail = download(url)
    if not html_text_detail:
        sys.stderr.write('skip %s: download failed\n' % url)
        return

    tree = lxml.html.fromstring(html_text_detail)
    try:
        title = CSSSelector(_TITLE_CSS)(tree)[0].text_content()
        price = CSSSelector(_PRICE_CSS)(tree)[0].text_content()
        deposit = CSSSelector(_DEPOSIT_CSS)(tree)[0].text_content()
    except IndexError:
        # A selector matched nothing: not a listing page we understand.
        sys.stderr.write('skip %s: listing fields not found\n' % url)
        return

    print(title)
    print('%s|%s' % (price, deposit))

    # Header rows are written once per listing, followed by the details.
    rows = [
        ('标题 : ', title, '#', url),
        ('价格: ', price, '#'),
        ('压付: ', deposit, '#'),
    ]
    for item_no in range(1, _DETAIL_ITEMS + 1):
        for span_no in range(1, _DETAIL_SPANS + 1):
            matches = CSSSelector(_DETAIL_CSS % (item_no, span_no))(tree)
            if not matches:
                # A missing cell must not discard the remaining ones.
                continue
            detail = matches[0].text_content().replace(' ', '')
            rows.append(('详情: ', detail, '#'))

    with _CSV_LOCK:
        # Binary append mode is what the Python 2 csv module expects.
        with open('%s_houses.csv' % CITY, 'ab') as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n')
            writer.writerows(rows)


def get_url(html):
    """Extract the unique listing links from an index page.

    Args:
        html: Markup of the rental index page, or ``None``/empty when the
            download failed.

    Returns:
        A list of ``href`` values in page order with duplicates and anchors
        lacking an ``href`` removed; an empty list when *html* is empty.
    """
    if not html:
        return []

    tree = lxml.html.fromstring(html)
    sel = CSSSelector(_LISTING_LINK_CSS)

    url_list = []
    seen = set()  # O(1) duplicate check; url_list preserves page order
    for anchor in sel(tree):
        href = anchor.get('href')
        if href and href not in seen:
            seen.add(href)
            url_list.append(href)
    return url_list


if __name__ == '__main__':
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    url_list = get_url(html_text_list)
    if not url_list:
        sys.stderr.write('no listings found at %s\n' % url_index)

    workers = []
    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        workers.append(thr)
        # Tiny pause carried over from the original script.
        time.sleep(0.001)

    # Wait for every listing to be written before the script ends.
    for thr in workers:
        thr.join()