1. Error
ImportError: No module named setuptools
Fix it by downloading and running ez_setup.py:
wget http://peak.telecommunity.com/dist/ez_setup.py
python ez_setup.py
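To confirm the fix, importing the module again in an interactive session should no longer raise the error:
>>> import setuptools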
2. Computing the Pearson correlation (note that sqrt must be imported from the math module):
from math import sqrt

def pearson(v1, v2):
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
    # Pearson score r, returned as a distance (1 - r)
    num = pSum - (sum1 * sum2 / len(v1))
    den = sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
    if den == 0:
        return 0
    return 1.0 - num / den
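A quick check with two made-up vectors shows that the function returns a distance rather than the raw correlation: 0.0 for perfectly correlated vectors, 2.0 for perfectly anti-correlated ones:
>>> pearson([1.0, 2.0, 3.0], [2.0, 4.0, 6.0])
0.0
>>> pearson([1.0, 2.0, 3.0], [3.0, 2.0, 1.0])
2.0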
3. A Python crawler
First, download BeautifulSoup.py, a Python module for parsing HTML and XML documents.
Download page: http://www.crummy.com/software/BeautifulSoup
Next, write the crawler module. A simple skeleton, searchengine.py, looks like this:
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
class crawler:
    # Initialize the crawler with the name of a database
    def __init__(self, dbname):
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        pass

    # Helper: get an entry id from a table, creating the row if absent
    def getentryid(self, table, field, value, createnew=True):
        return None

    # Index an individual page
    def addtoindex(self, url, soup):
        print 'Indexing %s' % url

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        return None

    # Split the text into individual words
    def separatewords(self, text):
        return None

    # Return True if this URL is already indexed
    def isindexed(self, url):
        return False

    # Record a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    # Starting with a list of pages, do a breadth-first crawl to the
    # given depth, indexing pages as we go
    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)

                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # drop the fragment part
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)

                self.dbcommit()

            pages = newpages

    # Create the database tables
    def createindextables(self):
        pass
Then simply import the module and run it:
>>> import sys
>>> sys.path.append('F:\\pythonP')  # the directory where searchengine.py is kept
>>> import searchengine
>>> pagelist = ['http://kiwitobes.com/wiki/Perl.html']
>>> crawler = searchengine.crawler('')
>>> crawler.crawl(pagelist)
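For reference, here is one way the two text-handling stubs in the class might eventually be filled in. This is only a sketch assuming the old BeautifulSoup 3 API (soup.string and soup.contents); drop the methods into the class body in place of the existing stubs and add the re import at the top of searchengine.py:
import re

    # Extract the visible text from a node (no tags), depth-first
    def gettextonly(self, soup):
        v = soup.string
        if v is None:
            resulttext = ''
            for t in soup.contents:
                resulttext += self.gettextonly(t) + '\n'
            return resulttext
        else:
            return v.strip()

    # Split text on any run of non-alphanumeric characters
    def separatewords(self, text):
        splitter = re.compile('\\W*')
        return [s for s in splitter.split(text) if s != '']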
4. Error case 2
dest = a[1]
IndexError: list index out of range
or
ValueError: need more than 1 value to unpack
Both errors can show up when reading a file, for example:
>>> file_object = open('F:\\pythonP\\schedule.txt')
>>> flights = {}
>>> for line in file_object:
        origin, dest, depart, arrive, price = line.strip().split(',')
        flights.setdefault((origin, dest), [])
        flights[(origin, dest)].append((depart, arrive, int(price)))
or the same thing written out field by field:
file_object = open('F:\\pythonP\\schedule.txt')
flights = {}
#for line in file('F:\\pythonP\\schedule.txt'):
for line in file_object:
    a = line.strip().split(',')
    origin = a[0]
    dest = a[1]
    depart = a[2]
    arrive = a[3]
    price = a[4]
    flights.setdefault((origin, dest), [])
    flights[(origin, dest)].append((depart, arrive, int(price)))
The code itself is fine; the problem was a few blank lines in schedule.txt. Remove them and the errors go away.
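Alternatively, the loop can skip blank or malformed lines itself instead of relying on the file being perfectly clean; a small defensive variant of the same loop:
flights = {}
for line in open('F:\\pythonP\\schedule.txt'):
    line = line.strip()
    if not line:            # skip empty lines instead of crashing
        continue
    fields = line.split(',')
    if len(fields) != 5:    # skip malformed lines as well
        continue
    origin, dest, depart, arrive, price = fields
    flights.setdefault((origin, dest), [])
    flights[(origin, dest)].append((depart, arrive, int(price)))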
5. To make a Django site reachable from other machines on the local network, run the development server bound to all interfaces:
python manage.py runserver 0.0.0.0:8000
You may also need to turn off the firewall (or open the port) on the machine running the server.