HTML 信息提取示例:一个简单的爬虫,使用 pycurl 抓取页面,并用 pyquery 从 HTML 中提取表格信息。

#!/usr/bin/env python
#coding=utf8

# Import sys FIRST, outside the guarded block below.
# BUG FIX: the original imported sys inside the try; if an earlier import
# (os/urllib/pycurl) failed, the except handler itself died with a
# NameError on `sys.stderr` instead of printing the friendly message.
import sys

try:
    import os
    import urllib
    import pycurl
    # cStringIO is the fast C implementation; fall back to the pure-Python
    # StringIO when it is unavailable.
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    from pyquery import PyQuery as pyq
    # Python 2 only: re-expose setdefaultencoding and switch the process
    # default encoding to UTF-8 so mixed str/unicode operations don't blow up.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except ImportError:
    print >> sys.stderr, """\
There was a problem importing one of the Python modules required.
The error leading to this problem was:

%s

Please install a package which provides this module, or
verify that the module is installed correctly.

It's possible that the above module doesn't match the current version of Python,
which is:

%s
""" % (sys.exc_info(), sys.version)
    sys.exit(1)

# Script metadata: program name, the code-hosting site this crawler was
# originally written against, and the version string.
__prog__ = "crawl"
__site__ = "http://www.oschina.net/code"
__version__ = "1.0"

class HttpRequest(object):
    """Reusable HTTP client wrapped around a single pycurl.Curl handle.

    The handle is configured once in __init__ and shared by every request
    made through this object, so cookies (persisted in cookie.dat) and the
    auto-referer carry over between calls.
    """

    curl = None  # class-level default; replaced by a real handle in __init__

    def __init__(self):
        self.url = None        # optional base URL used by set_url_para
        self.url_para = None   # last query-string suffix passed in
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.VERBOSE, 0)
        self.curl.setopt(pycurl.USERAGENT, 'Miozilla/4.0 (compatible; MSIE 8.0; WindowsNT 6.1)')
        self.curl.setopt(pycurl.HEADER, 1)           # include headers in the body stream
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)   # follow redirects ...
        self.curl.setopt(pycurl.MAXREDIRS, 5)        # ... but at most 5 hops
        self.curl.setopt(pycurl.COOKIEFILE, 'cookie.dat')
        self.curl.setopt(pycurl.COOKIEJAR, 'cookie.dat')
        self.curl.setopt(pycurl.HTTPGET, 1)
        self.curl.setopt(pycurl.ENCODING, 'gzip,deflate')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 60)  # seconds to establish the connection
        self.curl.setopt(pycurl.TIMEOUT, 300)        # seconds for the whole transfer

    def set_url_para(self, para):
        """Append *para* to self.url and set the result as the request URL.

        NOTE(review): self.url starts out as None, so the caller must
        assign it before using this method — confirm against callers.
        """
        self.url_para = para
        url = self.url + para
        self.curl.setopt(pycurl.URL, url)

    def set_post_para(self, para):
        """Switch the handle to POST and url-encode *para* as the body."""
        self.curl.setopt(pycurl.POST, 1)
        self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(para))

    def set_cookie(self, cookie):
        """Send an explicit Cookie header value on subsequent requests."""
        self.curl.setopt(pycurl.COOKIE, cookie)

    def dry_write(self, buf):
        """Sink callback: silently discard *buf* (used to drop headers)."""
        pass

    def download(self, url, file_path):
        """Download *url* into *file_path*, creating parent directories.

        Header output is suppressed and discarded so only the response
        body reaches the file.  BUG FIX: the original swallowed any
        transfer error and closed the *shared* curl handle, which made
        every later request on this instance fail; errors now propagate
        and the handle stays open (the `with` still closes the file).
        """
        out_dir = os.path.dirname(file_path)  # renamed: `dir` shadowed the builtin
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.HEADER, False)
        # Discard header data; without this the headers end up in the file.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.dry_write)
        with open(file_path, 'wb') as outfile:
            self.curl.setopt(pycurl.WRITEFUNCTION, outfile.write)
            self.curl.perform()

    def perform(self, url, referer=''):
        """Fetch *url*; record body, headers, status, effective URL, cookies.

        BUG FIX: the original caught every exception here, closed the
        shared curl handle AND the capture buffers, then returned as if
        nothing happened — later get_body() calls raised on the closed
        buffer and later requests found a dead handle.  Transfer errors
        now propagate so the caller can handle them.
        """
        assert url, 'url is null!'
        self.curl.setopt(pycurl.URL, url)
        if referer:
            self.curl.setopt(pycurl.REFERER, referer)
        self.buf = StringIO()
        self.head = StringIO()
        self.curl.setopt(pycurl.WRITEFUNCTION, self.buf.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.head.write)
        self.curl.perform()
        self.r = self.buf.getvalue()
        self.h = self.head.getvalue()
        self.code = self.curl.getinfo(pycurl.HTTP_CODE)
        self.info = self.curl.getinfo(pycurl.EFFECTIVE_URL)
        self.cookie = self.curl.getinfo(pycurl.INFO_COOKIELIST)
        # Auto-referer: the next request carries this response's final URL.
        self.curl.setopt(pycurl.REFERER, self.info)

    def __del__(self):
        # Guard the close: __del__ may run after the handle was already
        # closed, or even when __init__ failed before creating it.
        try:
            self.curl.close()
        except Exception:
            pass

    def get_body(self):
        """Return the body captured by the last successful perform()."""
        return self.r

    def get_head(self):
        """Return the raw response headers of the last perform()."""
        return self.h

    def get_code(self):
        """Return the HTTP status code of the last perform()."""
        return self.code

    def get_info(self):
        """Return the effective (post-redirect) URL of the last perform()."""
        return self.info

    def get_cookie(self):
        """Return the cookie list recorded by the last perform()."""
        return self.cookie

if __name__ == '__main__':

asp_range = xrange(1, 10)

page_range = xrange(1, 10)

crawl = HttpRequest()

for i in asp_range:

for j in page_range:

url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i, j)

try:

crawl.perform(url)

doc = pyq(crawl.get_body())

content = doc('.contd')

print content.children('div').eq(0).text()

for tr in content.items('tr'):

print tr.text()

except Exception, e:

print e

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值