XPath爬取百度搜索结果

基于 web.py 框架,使用 nginx 部署

主要使用 lxml 的 etree 模块,通过 XPath 解析 HTML 格式数据

以及处理过程中遇到的各种编码问题(UTF-8 的解码与编码)

部分XPath实现:

def parse_baidu(self, body):
		"""Extract search results from a Baidu result page.

		Selects every ``div`` whose class is ``result c-result`` or
		``result c-result c-clk-recommend``, reads the JSON stored in its
		``data-log`` attribute, and builds one dict per result.

		Args:
			body: Raw page content as UTF-8 encoded bytes.

		Returns:
			A list of dicts with the keys ``order``, ``link``, ``title`` and
			``desc`` (``title``/``desc`` fall back to ``"Unknown"`` when the
			matching node is absent), or ``None`` when the page as a whole
			cannot be parsed.  Results without usable ``data-log`` data are
			skipped individually so one bad entry does not discard the rest.
		"""
		print("parse_baidu ===>>")

		elements = []
		try:
			# The parser is fed decoded text, so convert from UTF-8 first.
			# NOTE(review): lower() is applied to the whole document, which
			# also lowercases the extracted links and titles -- confirm that
			# this is intended.
			page = etree.HTML(body.lower().decode('utf-8'))

			tags = page.xpath(u"//div[@class='result c-result'] | //div[@class='result c-result c-clk-recommend']")
			for tag in tags:
				# get() returns None for a missing attribute instead of raising.
				data_log_attrib = tag.get('data-log')
				if not data_log_attrib:
					continue

				try:
					# The attribute uses single quotes; JSON requires double quotes.
					data_log = json.loads(data_log_attrib.replace('\'', '"'))
					if not data_log:
						continue
					node = {'order': data_log['order'], 'link': data_log['mu']}
				except (ValueError, KeyError, TypeError) as e:
					# Malformed or incomplete metadata: drop only this result.
					print("parse_baidu skipped a result: {}".format(e))
					continue

				# Title and description are the text content of the first
				# node matched by their respective relative paths.
				for key, path in (('title', u"./div/a/h3"), ('desc', u"./div/div")):
					found = tag.xpath(path)
					if found:
						node[key] = found[0].xpath("string(.)").encode('utf-8')
					else:
						node[key] = "Unknown"

				elements.append(node)
			print("parse_baidu <<===")
			return elements

		except Exception as e:
			print("parse_baidu failed {}".format(e))
		print("parse_baidu <<=== end")
		return None
def query_baidu(self, keyword):
		"""Run a Baidu mobile search for *keyword* and return the parsed results.

		Uses the Python 2 APIs ``urllib.quote`` and ``urllib2``.

		Args:
			keyword: Search text; it is encoded to UTF-8 and URL-quoted
				before being placed in the query string.

		Returns:
			Whatever ``self.parse_baidu`` returns for the downloaded page,
			or ``None`` when building the request, fetching or parsing
			raises an exception.
		"""
		import contextlib

		print("query_baidu ===>>")

		try:
			_keyword = urllib.quote(keyword.encode('utf-8'))
			# NOTE(review): every parameter after "word" is a fixed value,
			# including "oq", which stays the same URL-encoded string no
			# matter what keyword is searched -- confirm these are needed.
			url_query = "https://m.baidu.com/from=1014284b/s?word=" + _keyword + "&sa=tb&ts=6902153&t_kt=0&ie=utf-8&rsv_t=9e926S4zLuzG32Q2kkM5Tu%252Bc%252B4TbHKAg9WiWPQfnflJUbt8%252BiCpIrXI%252FyApB%252FeM&rsv_pq=17372243552828969370&ss=111&rsv_sug4=14106&inputT=12708&oq=%E4%B9%A0%E8%BF%91%E5%B9%B3"

			request = urllib2.Request(url_query)
			# closing() guarantees the connection is released even when
			# read() raises; the urllib2 response is not a context manager.
			with contextlib.closing(urllib2.urlopen(request)) as response:
				body = response.read()

			return self.parse_baidu(body)

		except Exception as e:
			print("Query Baidu Error: {}".format(e))

		return None

	def query_google(self, keyword):
		"""Placeholder for a Google search; always returns a fixed notice string."""
		notice = "Not implemented!"
		return notice

	def GET(self):
		"""Handle a GET request: search Baidu for the ``keyword`` input and render it.

		The ``keyword`` request input defaults to ``"demo_keyword"``.  The
		template receives a dict holding the keyword and the client's
		``ip:port`` together with the search results.  Any exception is
		turned into a ``"GET Error: ..."`` string that is returned as the
		response.
		"""
		context = {}
		try:
			params = web.input(keyword = "demo_keyword")
			context['keyword'] = params.keyword
			context['extern_ip'] = "{}:{}".format(web.ctx.ip, web.ctx.env["REMOTE_PORT"])
			results = self.query_baidu(params.keyword)
			return self.render.template_index(context, results)
		except Exception as e:
			return "GET Error: {}".format(e)

	def POST(self):
		"""Handle a POST request; currently does nothing and returns None."""
		return None


 

html模版页面部分实现:

<div id="result"> 
		$if elements:
			$for e in elements:
			<h3>$e['order']</h3>
				$if e['link'] and e['title']:
				<li><a href="$e['link']">$e['title']</a></li>
				$if e['desc']:
					<li>$e['desc']</li>
		$else:
			<h3>Not get values!</h3>
<p>
</div>


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值