代码块
代码块语法遵循标准 Markdown 语法,例如:
@requires_authorization
#setup.py build
#setup.py install
from bs4 import BeautifulSoup
# Short alias for the parser class; the snippets below call it as `bs(...)`.
bs = BeautifulSoup

# Sample markup kept as separate fragments; they are concatenated with no
# separator before being parsed.
doc = [
    '<html><head><title>Page title</title></head>',
    '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
    '</html>',
]

# Parse the joined fragments into a document object.
soup = bs(''.join(doc))
#---
import re
import urllib
def getHtml(url):
    """Download ``url`` and return the raw response body.

    The request is always routed through the fixed HTTP proxy
    ``http://192.168.1.2:3128``.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``read()`` on the opened response returns (the page body).
    """
    page = urllib.urlopen(url, proxies={'http': 'http://192.168.1.2:3128'})
    try:
        return page.read()
    finally:
        # Release the connection even when read() fails; the original left
        # the handle open.
        page.close()
def getImg(html):
    """Collect the ``.jpg`` URLs referenced in ``html``.

    A URL is captured only where the markup contains ``src="<url>.jpg"``
    directly followed by a space and ``pic_ext``.

    Args:
        html: Page markup to scan, as a string.

    Returns:
        A list of the captured URL strings in the order they occur; an empty
        list when nothing matches.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    return imgre.findall(html)
def imgDownload(imglist):
    """Download every URL in ``imglist`` to a numbered ``.jpg`` file.

    Files are named ``0.jpg``, ``1.jpg``, ... in list order (relative to the
    current working directory). After each download a progress line with the
    1-based running count is printed.

    Args:
        imglist: Iterable of image URLs.
    """
    for index, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, '%s.jpg' % index)
        # One pre-formatted argument prints the same space-separated text
        # under the Python 2 print statement and the Python 3 print function.
        print('%s %s %s' % ('第', index + 1, '张图片下载完成'))
# Demo run: fetch one Tieba thread and print the image URLs found in it.
html = getHtml('http://tieba.baidu.com/p/2460150866')
print(getImg(html))
#---
def getItemNum(url):
    """Get the total number of items a seller has listed.

    Args:
        url: Any item-list page belonging to the seller.

    Returns:
        A list with the text found inside each ``<span class="rcnt">``
        element of the page, in document order; empty when the page has no
        such element. The text is returned as-is (not converted to int).
    """
    raw = getHtml(url)
    # Lazy capture: the original greedy ``(.*)?`` ran on to the last
    # ``</span>`` on the line whenever more than one span followed.
    p = re.compile(r'<span class="rcnt"\s*>(.*?)</span>')
    return p.findall(raw)
# Time how long it takes to fetch and parse the seller's item count.
# `datetime` is imported here because no import of it is visible in this file.
import datetime

starttime = datetime.datetime.now()
# NOTE(review): `url` is not defined in the visible source; it must be set
# before this snippet runs.
tmp = getItemNum(url)
# Strip thousands separators from the first match before converting.
itemNum = int(tmp[0].replace(',', ''))
endtime = datetime.datetime.now()
# `.seconds` is the whole-seconds component of the elapsed timedelta.
print((endtime - starttime).seconds)
#===
# Fetch the page at `url` and parse it with the `bs` alias.
html = getHtml(url)
soup = bs(html)
# Look up the element with id 'descItemNumber' and read its `.string`.
# The value is not stored: this is an interactive-style expression.
soup.find(id='descItemNumber').string
#----
# Read every line of fds.txt into `lines`; each entry keeps its trailing "\n".
# The `with` block closes the file once it has been read (the original left it
# open); `fid` stays bound afterwards, but to a closed file.
with open('fds.txt', 'r') as fid:
    lines = fid.readlines()
def getAddress(itemID):
    """Look up the address block on an eBay item page.

    Args:
        itemID: Item identifier as a string; it is appended to
            ``http://www.ebay.com/itm/`` to build the page URL.

    Returns:
        A document parsed from the joined contents of the first element
        selected by ``div.iti-eu-bld-gry``, or ``None`` when any step fails
        (bad id, fetch error, no matching element, ...).
    """
    try:
        tmpUrl = 'http://www.ebay.com/itm/' + itemID
        html = getHtml(tmpUrl)
        soup = bs(html)
        xx = soup.select('div.iti-eu-bld-gry ')
        # Re-parse the first match's contents as a standalone document.
        addr = bs(''.join(xx[0]))
        return addr
    except Exception as ex:
        # Deliberate best-effort lookup: report the problem and signal
        # "no address" to the caller instead of aborting a batch run.
        print(ex)
        print('没有找到地址')
        return None