import socket
import time
import urllib2
def spider(self, url, headers, tryTime=3):
    """Fetch ``url`` and return the page body re-encoded as UTF-8.

    The body is decoded as GBK (hard-coded below) and then encoded to a
    UTF-8 byte string.

    Args:
        self: Not used by the body; kept so existing call sites and method
            binding keep working.
        url: Address to request.
        headers: Mapping of HTTP headers handed to ``urllib2.Request``.
        tryTime: Maximum number of retries after a timeout (default 3).

    Returns:
        The UTF-8 encoded page, or ``None`` when the request fails with an
        HTTP error, with a non-timeout URL error, or still times out after
        all retries are used up.

    Raises:
        UnicodeDecodeError: If the body is not valid GBK.
    """
    retries_left = tryTime
    while True:
        try:
            req = urllib2.Request(url=url, headers=headers)
            page = urllib2.urlopen(req, timeout=30)
            try:
                raw = page.read()
            finally:
                # Always release the connection, even if read() fails.
                page.close()
        except urllib2.HTTPError:
            # HTTPError carries an integer ``code``. 301/302 redirects need
            # no handling here: urllib2 follows them automatically.
            return None
        except (urllib2.URLError, socket.timeout) as e:
            # A timeout arrives either wrapped in URLError.reason or as a
            # bare socket.timeout; only timeouts are retried.
            reason = getattr(e, 'reason', e)
            if isinstance(reason, socket.timeout) and retries_left > 0:
                retries_left -= 1
                continue
            return None
        # Decode from the source encoding first, then encode to the target.
        # With BeautifulSoup the source encoding can be passed instead and
        # it converts to unicode itself:
        #   newHtml = BeautifulSoup(str(html), fromEncoding='gbk')
        return raw.decode('gbk').encode('utf8')
import re

# Collect, case-insensitively, every span of ``html`` matching the pattern.
# NOTE(review): the pattern ends in "<h3>" rather than "</h3>" and ".*" is
# greedy, so it matches from the first "<h3>" to the last "<h3>" on a line.
# This looks like a typo for r'<h3>.*?</h3>' -- confirm intent before changing.
reg = r'<h3>.*<h3>'
content = re.findall(reg, html, re.I)
for item in content:
    # Placeholder: process each match here.
    pass
import BeautifulSoup

# ``BeautifulSoup`` is the module; the parser class lives inside it.
soup = BeautifulSoup.BeautifulSoup(str(html))

# Text shown on the page for the first class="a1" tag nested inside the
# first class="page" tag. Each lookup may come back empty, so guard it.
# getText() is used because the BeautifulSoup 3 package imported above does
# not provide get_text() (that is the bs4 spelling).
page_node = soup.find(attrs={'class': 'page'})
a1_node = page_node.find(attrs={'class': 'a1'}) if page_node is not None else None
pageInfo = a1_node.getText() if a1_node is not None else None

# Value of the href attribute of the first <a> tag (None when absent).
first_link = soup.a
hrefInfo = first_link.get('href') if first_link is not None else None

# The article body lives in the tag with id="art_cont", one <p> tag per
# paragraph.
article = soup.find(attrs={'id': 'art_cont'})
contents = article.findAll('p') if article is not None else []

# One stripped, UTF-8 encoded paragraph per line.
text = '\n'.join(
    paragraph.getText().encode('utf-8').strip() for paragraph in contents
).strip()
import threading
# Thread subclass whose run() takes a lock around the shared counter update:
class Test(threading.Thread):
    """Worker thread that increments the module-level ``count`` under ``mutex``.

    Both ``count`` (the shared counter) and ``mutex`` (a lock object) must
    exist as module globals before the thread runs.

    Args:
        num: Number of increments to perform; converted with ``int()``.
        delay: Seconds to sleep after each increment (default 1).
    """

    def __init__(self, num, delay=1):
        threading.Thread.__init__(self)
        self._run_num = num
        self._delay = delay

    def run(self):
        """Increment ``count`` ``num`` times, printing after each step."""
        global count
        threadname = threading.current_thread().name
        for x in range(int(self._run_num)):
            # ``with`` releases the lock even if the body raises.
            with mutex:
                count = count + 1
                # Snapshot inside the lock so the printed value is the one
                # this thread produced, not a later update by another thread.
                current = count
            print('%s %s %s' % (threadname, x, current))
            time.sleep(self._delay)
if __name__ == '__main__':
    # Names assigned here are already module globals, so no ``global``
    # statement is needed for the worker threads to see them.
    num = 4
    count = 1
    # Lock that guards ``count``.
    mutex = threading.Lock()
    # Create ``num`` worker threads, each doing 10 increments.
    threads = [Test(10) for _ in range(num)]
    for t in threads:
        t.start()
    # The main thread waits until every worker has finished.
    for t in threads:
        t.join()
import MySQLdb

# Example of untrusted input carrying an SQL-injection payload. It is defined
# before use (the original referenced it one line too early). The closing
# quote was a typographic U+2019 in the original; a straight quote is what
# terminates the SQL string literal.
a_variable = "'SA001';drop table c_order--"

# UNSAFE, shown only as a comment: concatenating the value directly, i.e.
#   "SELECT * FROM Users WHERE Value= " + a_variable + "..."
# produces
#   SELECT * FROM Users WHERE Value= 'SA001';drop table c_order--...
# which runs an unintended DROP TABLE and turns the rest into a comment.

# Preferred: keep the value out of the SQL text and let the driver quote it.
# The trailing "..." is carried over from the original example (presumably a
# stand-in for the rest of the statement).
sql = "SELECT * FROM Users WHERE Value= %s..."
params = (a_variable,)
# With an open cursor: cursor.execute(sql, params)

# Alternative when a statement must be assembled by hand: use the escaping
# helper shipped with MySQLdb and keep its result (the original discarded it).
# It backslash-escapes quote characters; it does not add surrounding quotes.
escaped_variable = MySQLdb.escape_string(a_variable)