HTML 信息提取示例:一个简单的爬虫,使用 pycurl 抓取页面,并用 pyquery 从 HTML 中提取表格信息。

#!/usr/bin/env python
#coding=utf8

# Import sys FIRST, outside the guarded block below.
# BUG FIX: the original imported sys inside the try; if an earlier import
# (os/urllib/pycurl) failed, the except handler itself died with a
# NameError on `sys.stderr` instead of printing the friendly message.
import sys

try:
    import os
    import urllib
    import pycurl
    # cStringIO is the fast C implementation; fall back to the pure-Python
    # StringIO when it is unavailable.
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    from pyquery import PyQuery as pyq
    # Python 2 only: re-expose setdefaultencoding and switch the process
    # default encoding to UTF-8 so mixed str/unicode operations don't blow up.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except ImportError:
    print >> sys.stderr, """\
There was a problem importing one of the Python modules required.
The error leading to this problem was:

%s

Please install a package which provides this module, or
verify that the module is installed correctly.

It's possible that the above module doesn't match the current version of Python,
which is:

%s
""" % (sys.exc_info(), sys.version)
    sys.exit(1)

# Script metadata: program name, the code-hosting site this crawler was
# originally written against, and the version string.
__prog__ = "crawl"
__site__ = "http://www.oschina.net/code"
__version__ = "1.0"

class HttpRequest(object):
    """Reusable HTTP client wrapped around a single pycurl.Curl handle.

    The handle is configured once in __init__ and shared by every request
    made through this object, so cookies (persisted in cookie.dat) and the
    auto-referer carry over between calls.
    """

    curl = None  # class-level default; replaced by a real handle in __init__

    def __init__(self):
        self.url = None        # optional base URL used by set_url_para
        self.url_para = None   # last query-string suffix passed in
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.VERBOSE, 0)
        self.curl.setopt(pycurl.USERAGENT, 'Miozilla/4.0 (compatible; MSIE 8.0; WindowsNT 6.1)')
        self.curl.setopt(pycurl.HEADER, 1)           # include headers in the body stream
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)   # follow redirects ...
        self.curl.setopt(pycurl.MAXREDIRS, 5)        # ... but at most 5 hops
        self.curl.setopt(pycurl.COOKIEFILE, 'cookie.dat')
        self.curl.setopt(pycurl.COOKIEJAR, 'cookie.dat')
        self.curl.setopt(pycurl.HTTPGET, 1)
        self.curl.setopt(pycurl.ENCODING, 'gzip,deflate')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 60)  # seconds to establish the connection
        self.curl.setopt(pycurl.TIMEOUT, 300)        # seconds for the whole transfer

    def set_url_para(self, para):
        """Append *para* to self.url and set the result as the request URL.

        NOTE(review): self.url starts out as None, so the caller must
        assign it before using this method — confirm against callers.
        """
        self.url_para = para
        url = self.url + para
        self.curl.setopt(pycurl.URL, url)

    def set_post_para(self, para):
        """Switch the handle to POST and url-encode *para* as the body."""
        self.curl.setopt(pycurl.POST, 1)
        self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(para))

    def set_cookie(self, cookie):
        """Send an explicit Cookie header value on subsequent requests."""
        self.curl.setopt(pycurl.COOKIE, cookie)

    def dry_write(self, buf):
        """Sink callback: silently discard *buf* (used to drop headers)."""
        pass

    def download(self, url, file_path):
        """Download *url* into *file_path*, creating parent directories.

        Header output is suppressed and discarded so only the response
        body reaches the file.  BUG FIX: the original swallowed any
        transfer error and closed the *shared* curl handle, which made
        every later request on this instance fail; errors now propagate
        and the handle stays open (the `with` still closes the file).
        """
        out_dir = os.path.dirname(file_path)  # renamed: `dir` shadowed the builtin
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.HEADER, False)
        # Discard header data; without this the headers end up in the file.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.dry_write)
        with open(file_path, 'wb') as outfile:
            self.curl.setopt(pycurl.WRITEFUNCTION, outfile.write)
            self.curl.perform()

    def perform(self, url, referer=''):
        """Fetch *url*; record body, headers, status, effective URL, cookies.

        BUG FIX: the original caught every exception here, closed the
        shared curl handle AND the capture buffers, then returned as if
        nothing happened — later get_body() calls raised on the closed
        buffer and later requests found a dead handle.  Transfer errors
        now propagate so the caller can handle them.
        """
        assert url, 'url is null!'
        self.curl.setopt(pycurl.URL, url)
        if referer:
            self.curl.setopt(pycurl.REFERER, referer)
        self.buf = StringIO()
        self.head = StringIO()
        self.curl.setopt(pycurl.WRITEFUNCTION, self.buf.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.head.write)
        self.curl.perform()
        self.r = self.buf.getvalue()
        self.h = self.head.getvalue()
        self.code = self.curl.getinfo(pycurl.HTTP_CODE)
        self.info = self.curl.getinfo(pycurl.EFFECTIVE_URL)
        self.cookie = self.curl.getinfo(pycurl.INFO_COOKIELIST)
        # Auto-referer: the next request carries this response's final URL.
        self.curl.setopt(pycurl.REFERER, self.info)

    def __del__(self):
        # Guard the close: __del__ may run after the handle was already
        # closed, or even when __init__ failed before creating it.
        try:
            self.curl.close()
        except Exception:
            pass

    def get_body(self):
        """Return the body captured by the last successful perform()."""
        return self.r

    def get_head(self):
        """Return the raw response headers of the last perform()."""
        return self.h

    def get_code(self):
        """Return the HTTP status code of the last perform()."""
        return self.code

    def get_info(self):
        """Return the effective (post-redirect) URL of the last perform()."""
        return self.info

    def get_cookie(self):
        """Return the cookie list recorded by the last perform()."""
        return self.cookie

if __name__ == '__main__':

asp_range = xrange(1, 10)

page_range = xrange(1, 10)

crawl = HttpRequest()

for i in asp_range:

for j in page_range:

url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i, j)

try:

crawl.perform(url)

doc = pyq(crawl.get_body())

content = doc('.contd')

print content.children('div').eq(0).text()

for tr in content.items('tr'):

print tr.text()

except Exception, e:

print e

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值