1 # -*- coding: cp936 -*-
2 import urllib2
3 import re
4 from pyquery import PyQuery as pq
5 from lxml import etree
6
# Regex for simple e-mail addresses: word characters, '@', then one or more
# dot-separated alphanumeric labels ending in an alphabetic TLD.
# Earlier, stricter variant kept for reference:
#mailpattern = re.compile(r'[^\._:>\\-][\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
mailpattern = re.compile(r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')  # raw string so backslashes stay literal

htmlcount = 0      # number of pages fetched so far
maxcount = 3000    # crawl budget: stop after this many pages
allUrls = set()    # every URL discovered (deduplicated)
allMails = set()   # every e-mail address harvested (deduplicated)
UrlsQlist = []     # pending-URL queue (ordered)
UrlsQdict = {}     # queue bookkeeping keyed by URL
url = "http://www.163.com"  # seed URL; also the base for resolving relative links
# NOTE(review): result files are opened for append at import time with
# hard-coded Windows paths and are never explicitly closed -- consider
# opening them lazily in the save functions, or closing on shutdown.
fmails = open("E:/py/crawler/mailresult.txt","a")
furls = open("E:/py/crawler/urlresult.txt","a")
19
20
21
22
def geturls(data):#the function to get the urls in the html
    """Extract link targets from an HTML document.

    data -- raw HTML string; falsy input yields None.
    Returns a set of absolute URLs (javascript: pseudo-links skipped),
    or None when data is empty.
    """
    if not data:
        return None
    # Local import keeps the module's top-level import block untouched.
    import urlparse
    urls = set()
    d = pq(data)
    label_a = d.find('a')  # all anchor elements in the document
    if label_a:
        # .attr('href') returns None for anchors without an href;
        # pyquery's .map() drops None results, so those are skipped.
        label_a_href = d('a').map(lambda i,e:pq(e)('a').attr('href'))
        for u in label_a_href:
            if u.startswith("javascript"):
                continue
            if u.startswith("http"):
                urls.add(u)
            else:
                # Proper RFC 3986 resolution against the seed URL.
                # The old `url + u` concatenation broke hrefs that do
                # not begin with '/' and protocol-relative '//...' links.
                urls.add(urlparse.urljoin(url, u))
    return urls
41
def gethtml(url):
    """Fetch `url` and return the raw response body, or None on failure.

    Fixes over the original: the bare `except:` (which also swallowed
    SystemExit/KeyboardInterrupt) is narrowed to Exception, and the
    connection is closed even if read() raises.
    """
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print("urllib2.urlopen error")
        return None
    try:
        mybytes = fp.read()
    finally:
        fp.close()
    return mybytes
52
53 def savemails(data): # the function to save the emails
54 if data:
55 mail