# -*- coding: utf-8 -*-
import random
import sys
import urllib2

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Python 2 only: sys.setdefaultencoding is reachable only after reload(sys).
reload(sys)
sys.setdefaultencoding('utf-8')

# Open the database connection.
conn = pymysql.Connection(
    host="localhost",
    user="root",
    passwd="root",
    db='test',
    charset="UTF8",
)
# Create the cursor shared by the functions below.
cursor = conn.cursor()

# PhantomJS capabilities: start from the stock set and turn image loading off.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap.update({"phantomjs.page.settings.loadImages": False})

# Location of the PhantomJS executable handed to webdriver.PhantomJS.
phantomjs_driver_path = 'E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe'
# ---- Table bootstrap for db_cp ----
def ready():
    """Ensure the ``db_cp`` table exists, creating it if the probe query fails.

    A ``select`` against ``db_cp`` is used as an existence probe. When the
    server rejects it with ``pymysql.ProgrammingError`` (the class PyMySQL
    raises for a missing table), the table is created with the columns
    ``id int`` and ``number varchar(255)``.

    Uses the module-level ``cursor`` and ``conn``. Any other database error
    (lost connection, access denied, ...) now propagates to the caller instead
    of being mistaken for "table does not exist".
    """
    try:
        # Probe: succeeds only when the table (and its `number` column) exist.
        cursor.execute('select number from db_cp')
        # Drain the result set; the rows themselves are not needed.
        cursor.fetchall()
        # Autocommit is not enabled, so commit explicitly.
        conn.commit()
    except pymysql.ProgrammingError:
        # The probe failed because the table is missing: create it.
        cursor.execute('CREATE TABLE db_cp(id int,number varchar(255))')
        conn.commit()
def insert_tb(list):
    """Bulk-insert values into the ``number`` column of ``db_cp`` and commit.

    Args:
        list: sequence of parameter sets, passed unchanged to
            ``cursor.executemany``. The name shadows the builtin but is kept
            because it is part of the function's existing signature.

    The insert is best-effort: on failure the message ``Add this db fault!``
    is printed and the function returns normally. Only ``Exception``
    subclasses are handled, so Ctrl-C / ``SystemExit`` are no longer swallowed.
    """
    try:
        cursor.executemany("insert into db_cp(number)values(%s)", list)
        # Commit, otherwise the inserted rows are not persisted.
        conn.commit()
    except Exception:
        print('Add this db fault!')
# ---- Separator banner ----
def line():
    """Print an 80-dash rule followed by the start-up message."""
    banner = ('-' * 80, 'Strating...............')
    for text in banner:
        print(text)


# ---- HTML download ----
def get_html(url):
    """Download ``url`` and return the response body.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: address to fetch.

    Returns:
        The body returned by ``read()``, or ``None`` when ``urllib2.URLError``
        is raised (the error is printed, not re-raised).
    """
    # Request headers
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)

    html = None
    try:
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib2.URLError as e:
        # Same output as the original two-item print statement.
        print('%s %s' % (url + 'Download error:', e.reason))
        html = None
    return html


# ---- PhantomJS scraping ----
def get_bt(url):
    """Render ``url`` in PhantomJS and return its numbers as one string.

    The texts of all ``<span>`` elements under the element with id
    ``redsId`` are joined with ``-``, then the text of the element with id
    ``blueId`` is appended after another ``-``. Each span text is printed as
    it is read.
    (Presumably these are the red and blue balls of a lottery draw -
    confirm against the target page.)

    Args:
        url: page to load.

    Returns:
        The dash-separated string described above.

    The driver is always shut down before returning, so a PhantomJS process
    is no longer left behind on every call or on a lookup failure.
    """
    print('starting get data in' + url + '\n')
    driver = webdriver.PhantomJS(phantomjs_driver_path, desired_capabilities=dcap)
    try:
        # Load the page and read its content.
        driver.get(url)
        new_span_list = driver.find_element_by_id('redsId').find_elements_by_tag_name('span')

        # Collect the span texts.
        red_numbers = []
        for v in new_span_list:
            print(v.text)
            red_numbers.append(v.text)
        number = "-".join(red_numbers)

        blue_ball = driver.find_element_by_id('blueId')
        number += '-' + str(blue_ball.text)
        return number
    finally:
        driver.quit()


# ---- Main routine ----
def main():
    """Scrape the listing page(s), resolve every linked detail page and store the results.

    For each listing page: download it with ``get_html``, find every
    ``<span class="Nlink">``, follow the ``href`` of its ``<a>`` through
    ``get_bt`` and hand the collected numbers to ``insert_tb``.

    Fixes relative to the previous version:
      * the rows are stored with ``insert_tb`` (``insert`` was never defined);
      * a failed download (``get_html`` returned ``None``) skips the page
        instead of crashing inside BeautifulSoup;
      * spans without an ``<a href=...>`` are skipped;
      * every row is a real 1-tuple, the parameter shape ``executemany`` expects.
    """
    line()
    # ready()  # NOTE(review): table creation is disabled here; db_cp must already exist.

    # Only page 1 is visited; widen the range to crawl further listing pages.
    for v in range(1, 2):
        if v == 1:
            url = 'http://www.zhcw.com/ssq/kjgg/'
        else:
            url = 'http://www.zhcw.com/ssq/kjgg/index_' + str(v) + '.shtml'

        html = get_html(url)
        if html is None:
            # get_html has already printed the download error.
            continue

        obj = BeautifulSoup(html, 'html.parser')
        span_list = obj.find_all('span', {'class': 'Nlink'})

        number_list = []
        for span in span_list:
            anchor = span.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            href = anchor['href']
            print('-' * 80)
            new_href = 'http://www.zhcw.com' + href
            number = get_bt(new_href)
            number_list.append((number,))
            print(number)

        if number_list:
            insert_tb(number_list)


def test():
    """Manual check: download one fixed URL with ``get_html`` and print the result."""
    html = get_html('http://www.foods1.com/TurnImg/mobile')
    print(html)


# ---- Script entry point ----
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()