一:爬虫的常规方法
爬取网页表格的常用套路是按 table → tr(行)→ th/td(单元格)的层级逐级解析。
'''
Created on Feb 28, 2017
@author: hcq908
'''
import csv
import os
# import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
if __name__ == '__main__':
iCntTable = 0;
html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
#html = urlopen("http://www.shfe.com.cn/bourseService/businessdata/summaryinquiry/index.html?paramid=trading_daily")
bsObj = BeautifulSoup(html, "html.parser")
oTables = bsObj.find_all("table")#选定第一个表格
for table in oTables:
iCntTable =iCntTable + 1;
print('处理第%d个表格 \n'%iCntTable)
#获取表格名称
#sTitleTag = table.find('caption');#标题只有一个,注意有的没有标题等
#print(sTitleTag)
# sMatchText = re.compile(r'<[^>]+>', re.S)
# sTextRemain = sMatchText.sub('', sTitleTag)
sTitleName= chr(iCntTable)+'.csv';
#路径不存在时需要新建
sDir = './files';
if not os.path.exists(s