##!/usr/bin/env python
#-*- coding: utf8 -*-
from urllib2 import Request,urlopen,URLError
import urllib2
#import http.cookiejar
from BeautifulSoup import BeautifulSoup
#from mysql.common import MySQLCurd
import re
#import MySQLdb
import types
import sys
#import getContent
# Keep references to the current standard streams so they can be put back
# after the reload below.
stdout = sys.stdout
stdin = sys.stdin
stderr = sys.stderr
# Python 2 idiom: reload the sys module so that sys.setdefaultencoding()
# can be called further down (presumably it is not reachable otherwise --
# see the Python 2 `sys` documentation).
reload (sys)
# Re-install the stream objects saved above (presumably because the reload
# rebinds them -- TODO confirm).
sys.stdout = stdout
sys.stdin = stdin
sys.stderr = stderr
# Make utf-8 the interpreter-wide default string encoding.
sys.setdefaultencoding('utf-8')
def _split_level(descr):
    """Split a description cell into its level marker and the text after it.

    Returns a ``(level, desc)`` pair.  ``level`` is the ``'Overall Level:'``
    marker plus the single character that follows it; ``desc`` is everything
    after that.  When the marker is absent, ``level`` is ``''`` and ``desc``
    is the whole input (a failed ``find`` used to feed -1 into the slices).

    NOTE(review): exactly one character is kept after the marker, as before.
    If the page text has a space after the colon the digit ends up in
    ``desc`` instead -- confirm against real input.
    """
    marker = 'Overall Level:'
    start = descr.find(marker)
    if start == -1:
        return '', descr
    end = start + len(marker) + 1
    return descr[start:end], descr[end:]


def _classify_embodiment(descr):
    """Return the first known embodiment label found in ``descr``.

    Labels are tried in a fixed priority order and matched case-sensitively.
    Returns ``None`` when none of them occurs.
    """
    for label in ('Multi-Chip Stand Alone', 'Multi-Chip embedded',
                  'Single-chip'):
        if label in descr:
            return label
    return None


def _parse_row(cells):
    """Map the cell texts of one table row onto named fields.

    ``cells`` is a sequence of strings in column order: cert, vendor,
    module, module type, date, description.  Missing trailing cells are
    treated as empty strings so that a short row never reuses values from a
    previous row; cells beyond the sixth are ignored.

    Returns a dict with the keys ``cert``, ``vendor``, ``module``, ``type``,
    ``date``, ``level``, ``classtype`` and ``desc``.
    """
    wanted = 6
    padded = list(cells[:wanted]) + [''] * (wanted - len(cells))
    cert, vendor, module, module_type, date, descr = padded
    level, desc = _split_level(descr)
    return {
        'cert': cert,
        'vendor': vendor,
        'module': module,
        'type': module_type,
        'date': date,
        'level': level,
        'classtype': _classify_embodiment(descr),
        'desc': desc,
    }


def do_list(path='test.html'):
    """Parse the HTML table stored in ``path`` and print each row's fields.

    The file is read in full and handed to BeautifulSoup.  Every ``<tr>``
    under the first ``<tbody>`` except the first one is processed; rows
    without ``<td>`` cells are skipped.  Intermediate parse results are
    dumped to stdout along the way, followed by one ``name=value`` line per
    extracted field.

    If the document has no ``<tbody>`` the row list is empty and no rows
    are printed.

    Returns ``None``.
    """
    with open(path) as file_object:
        all_the_text = file_object.read()
    print(all_the_text)

    datasoup = BeautifulSoup(all_the_text)
    print("datasoup=====")
    print(datasoup)

    # All <tr> elements of the listing table.
    tbody = datasoup.find('tbody')
    list_soup = tbody.findAll('tr') if tbody is not None else []
    print("list_soup========")
    print(list_soup)

    # The first <tr> is skipped unconditionally.
    for item in list_soup[1:]:
        item_td = item.findAll('td')
        if not item_td:
            continue
        print("len---------------")
        print(len(item_td))
        print("item_td==========================================")
        print(item_td)

        used_cells = item_td[:6]
        for index in range(len(used_cells)):
            print("index=%d td====" % index)

        row = _parse_row([td.text for td in used_cells])
        print("cert=%s" % row['cert'])
        print("vendor=%s" % row['vendor'])
        print("module=%s" % row['module'])
        print("type=%s" % row['type'])
        print("date=%s" % row['date'])
        print("level=%s" % row['level'])
        print("classtype=%s" % row['classtype'])
        print("desc=%s" % row['desc'])
# Run the table parser as soon as this file is executed (or imported).
do_list()
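# NOTE(review): everything from this line to the end of the file repeats the
# script above verbatim (looks like an accidental double paste). An encoding
# declaration is only honoured on line 1 or 2 of a file, so the one below has
# no effect here.
#-*- coding: utf8 -*-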
from urllib2 import Request,urlopen,URLError
import urllib2
#import http.cookiejar
from BeautifulSoup import BeautifulSoup
#from mysql.common import MySQLCurd
import re
#import MySQLdb
import types
import sys
#import getContent
# Keep references to the current standard streams so they can be put back
# after the reload below.
stdout = sys.stdout
stdin = sys.stdin
stderr = sys.stderr
# Python 2 idiom: reload the sys module so that sys.setdefaultencoding()
# can be called further down (presumably it is not reachable otherwise --
# see the Python 2 `sys` documentation).
reload (sys)
# Re-install the stream objects saved above (presumably because the reload
# rebinds them -- TODO confirm).
sys.stdout = stdout
sys.stdin = stdin
sys.stderr = stderr
# Make utf-8 the interpreter-wide default string encoding.
sys.setdefaultencoding('utf-8')
def _split_level(descr):
    """Split a description cell into its level marker and the text after it.

    Returns a ``(level, desc)`` pair.  ``level`` is the ``'Overall Level:'``
    marker plus the single character that follows it; ``desc`` is everything
    after that.  When the marker is absent, ``level`` is ``''`` and ``desc``
    is the whole input (a failed ``find`` used to feed -1 into the slices).

    NOTE(review): exactly one character is kept after the marker, as before.
    If the page text has a space after the colon the digit ends up in
    ``desc`` instead -- confirm against real input.
    """
    marker = 'Overall Level:'
    start = descr.find(marker)
    if start == -1:
        return '', descr
    end = start + len(marker) + 1
    return descr[start:end], descr[end:]


def _classify_embodiment(descr):
    """Return the first known embodiment label found in ``descr``.

    Labels are tried in a fixed priority order and matched case-sensitively.
    Returns ``None`` when none of them occurs.
    """
    for label in ('Multi-Chip Stand Alone', 'Multi-Chip embedded',
                  'Single-chip'):
        if label in descr:
            return label
    return None


def _parse_row(cells):
    """Map the cell texts of one table row onto named fields.

    ``cells`` is a sequence of strings in column order: cert, vendor,
    module, module type, date, description.  Missing trailing cells are
    treated as empty strings so that a short row never reuses values from a
    previous row; cells beyond the sixth are ignored.

    Returns a dict with the keys ``cert``, ``vendor``, ``module``, ``type``,
    ``date``, ``level``, ``classtype`` and ``desc``.
    """
    wanted = 6
    padded = list(cells[:wanted]) + [''] * (wanted - len(cells))
    cert, vendor, module, module_type, date, descr = padded
    level, desc = _split_level(descr)
    return {
        'cert': cert,
        'vendor': vendor,
        'module': module,
        'type': module_type,
        'date': date,
        'level': level,
        'classtype': _classify_embodiment(descr),
        'desc': desc,
    }


def do_list(path='test.html'):
    """Parse the HTML table stored in ``path`` and print each row's fields.

    The file is read in full and handed to BeautifulSoup.  Every ``<tr>``
    under the first ``<tbody>`` except the first one is processed; rows
    without ``<td>`` cells are skipped.  Intermediate parse results are
    dumped to stdout along the way, followed by one ``name=value`` line per
    extracted field.

    If the document has no ``<tbody>`` the row list is empty and no rows
    are printed.

    Returns ``None``.
    """
    with open(path) as file_object:
        all_the_text = file_object.read()
    print(all_the_text)

    datasoup = BeautifulSoup(all_the_text)
    print("datasoup=====")
    print(datasoup)

    # All <tr> elements of the listing table.
    tbody = datasoup.find('tbody')
    list_soup = tbody.findAll('tr') if tbody is not None else []
    print("list_soup========")
    print(list_soup)

    # The first <tr> is skipped unconditionally.
    for item in list_soup[1:]:
        item_td = item.findAll('td')
        if not item_td:
            continue
        print("len---------------")
        print(len(item_td))
        print("item_td==========================================")
        print(item_td)

        used_cells = item_td[:6]
        for index in range(len(used_cells)):
            print("index=%d td====" % index)

        row = _parse_row([td.text for td in used_cells])
        print("cert=%s" % row['cert'])
        print("vendor=%s" % row['vendor'])
        print("module=%s" % row['module'])
        print("type=%s" % row['type'])
        print("date=%s" % row['date'])
        print("level=%s" % row['level'])
        print("classtype=%s" % row['classtype'])
        print("desc=%s" % row['desc'])
# Run the table parser as soon as this file is executed (or imported).
do_list()