爬取58同城房产数据简单例子:
import re
import time
import urllib.request

# Listing page that the script crawls when run directly. An alternative listing
# page used previously:
# 'http://bj.58.com/ershoufang/?PGTID=0d200001-0000-177a-59fd-ff451a7a6078&ClickID=1'
url = 'http://jq.58.com/ershoufang/?PGTID=0d200001-0289-326f-83b1-136a8a11d60c&ClickID=1'

# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 10

# Captures the detail-page link of every entry on a listing page.
_LISTING_LINK_PATTERN = re.compile(
    r'<p class="bthead">[\s\S]*?<a href="(.*?)" target="_blank" class="t" ')

# Captures, in order: total price, unit price, title, update time, phone number.
# The brackets around the unit price are matched literally; left unescaped they
# would form an extra capture group and shift every later field by one.
# NOTE(review): whether the live page uses ASCII or full-width punctuation
# cannot be confirmed from this file, so both forms are accepted.
_HOUSE_INFO_PATTERN = re.compile(
    r'售价[::](.*?)万元[((](.*?)元/㎡[))]'
    r'[\s\S]*?<h1 class="c_333 f20">(.*?)</h1>'
    r"[\s\S]*?<span class='up' id=''>(.*?)</span>"
    r"[\s\S]*?<p class='phone-num-2'>(\d{1,11})</p>")


def get_html(url):
    """Fetch ``url`` with an HTTP GET and return the decoded response text."""
    # Imported here because this is the only function that needs the
    # third-party package; the parsing functions stay importable without it.
    import requests

    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return response.text


def get_content(html):
    """Print and return the detail-page links found in listing-page ``html``.

    Returns a list of URL strings in page order; the list is empty when the
    page contains no matching entry.
    """
    subUrls = _LISTING_LINK_PATTERN.findall(html)
    for index, subUrl in enumerate(subUrls, start=1):
        print("房产链接%d : %s" % (index, subUrl))
    return subUrls


def _fetch_page(page_url):
    """Download ``page_url`` with urllib and return its body decoded as UTF-8."""
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(page_url, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode('utf8')


def _parse_house_info(subHtml):
    """Return (price, unit price, title, update time, phone) or None if absent."""
    match = _HOUSE_INFO_PATTERN.search(subHtml)
    return match.groups() if match else None


def getHouseInfo(subUrls, *, fetch=None, delay=0.2):
    """Download each detail page in ``subUrls`` and print its key fields.

    Args:
        subUrls: iterable of detail-page URLs, e.g. the result of get_content().
        fetch: optional callable mapping a URL to the page's HTML text;
            defaults to downloading the page with urllib.
        delay: seconds to pause after each page, to throttle the crawl.

    Returns:
        A list with one (price, unit price, title, update time, phone) tuple
        of strings per page that could be parsed. Pages that do not match the
        expected layout are reported and skipped.
    """
    if fetch is None:
        fetch = _fetch_page

    records = []
    for index, subUrl in enumerate(subUrls, start=1):
        info = _parse_house_info(fetch(subUrl))
        time.sleep(delay)
        if info is None:
            # Previously an unmatched page raised IndexError and ended the crawl.
            print('%d 未匹配到房产信息: %s' % (index, subUrl))
            continue
        print('%d售价:%s万元 单价:%s元/㎡ 标题;%s 更新时间:%s 联系电话:%s'
              % ((index,) + info))
        records.append(info)
    return records


def main():
    """Crawl the configured listing page and print every house found on it."""
    html = get_html(url)
    subUrls = get_content(html)
    getHouseInfo(subUrls)


if __name__ == '__main__':
    main()