import web

# URL routing table: map the site root to the `index` handler class below.
urls = (
    '/', 'index',
)
app = web.application(urls, globals())


class index:
    """web.py handler for GET / — responds with a fixed greeting string."""

    def GET(self):
        # web.py returns the string directly as the HTTP response body.
        greeting = "Hello World"
        return greeting


if __name__ == "__main__":
    app.run()
import urllib
import urllib2

if __name__ == '__main__':
    # Build the request: target URL, a browser-like User-Agent header, and
    # url-encoded form data (supplying `data` makes urllib2 send a POST).
    url = "https://www.baidu.com/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
    values = {'input': '3333@qq.com'}
    data = urllib.urlencode(values)
    request = urllib2.Request(url=url, data=data, headers=headers)
    # Send the request and print the raw response body.
    response = urllib2.urlopen(request)
    print(response.read())
# 抓取网页内容事例 (Example: scraping web page content):
import urllib2
import re
import os
if __name__=='__main__':
for i in range(1,1):
#抓取过程
#1.访问其中一个网页地址,获取网页源代码
url='http://www.qiushibaike.com/textnew/page/'+str(i)+'?s=4832451'
user_agent='Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
headers={'User-Agent':user_agent}
try:
request=urllib2.Request(url=url,headers=headers)
response=urllib2.urlopen(request)
content=response.read()
except urllib2.HTTPError as e:
print ‘网络无法访问’
exit()
except urllib2.URLError as e:
print e
exit()
#2.根据抓取到的网页源代码去提取想要的数据
pattern=re.compile('<div class="content">(.*?)</div>',re.S)
items=re.findall(pattern,content)
for item in items:
#保存之前将网页的换行<br>转为\n
print item
item_new=item[1].replace('\n','').repalce('<br/>','\n')
#item[1]时间戳item[0]文本信息
#3.保存抓取的数据
path='qiubai'
if not os.path.exists(path):
os.makedirs(path)
file_path=path+'/'+item[1]+'.txt'
f=open(file_path,'w')
f.write(item_new)
f.close()
#4.抓取其他剩下的页面