因为合作的关系,用户在百度上搜索一些检索词时,百度会把我们网站放到搜索结果的第一位,但是经过实践发现,好多关键字都不是这样的。
所以写了一个小爬虫来检验2000-3000个关键字的移动和pc端百度搜索结果。
利用get的方法提交url(把检索词拼接在查询字符串中),然后对返回的数据进行正则匹配,
找出第一位的搜索结果是不是含有我们网站的链接。
以下是 Python(2.x)实现代码:
# coding=utf-8
import urllib
import urllib2
from urllib2 import Request, urlopen, URLError, HTTPError
import re
import time
import random
# Compile the regular expression into a Pattern object once, up front.
#pattern = re.compile(r'(id="1"){1}.{2,30}(book.zongheng.com){1}.*\.html') #pc
# Matches a rank-1 result (">1...<em>") that links to zongheng.com (wap/h5 markup).
pattern = re.compile(r'(>1.{0,6}<em>){1}.*(zongheng\.com){1}.*(2&?#?160)') #wap\h5
# Append-mode log file collecting every keyword whose first result is NOT our site.
output = open('D:\\result.txt', 'a')
# Global request counter, used below to throttle after every 20 requests.
k=0
# pattern.search() returns a Match object on success, or None when nothing matches.
#match = pattern.search('http://book.zongheng.com/book/262883.html?fr=p')
def search(key):
global k
url = 'http://m.baidu.com/s?word='+key
user_agent = 'Mozilla/4.1 (compatible; MSIE 5.5; Windows NT)'
values = {'wd' : '111' }
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values) # 编码工作
req = urllib2.Request(url) # 发送请求同时传data表单
#response = urllib2.urlopen(req) #接受反馈的信息
#the_page = response.read() #读取反馈的内容
#print the_page
#req = Request('http://bbs.csdn.net/callmewhy')
try:
response = urllib2.urlopen(req)
except URLError, e:
if hasattr(e, 'code'):
print key+' The server couldn\'t fulfill the request.'
print 'Error code: ', e.code
search(key)
elif hasattr(e, 'reason'):
print key+' We failed to reach a server.'
print 'Reason: ', e.reason
search(key)
else:
#print 'No exception was raised.'
# everything is fine
the_page = response.read() #读取反馈的内容
#print 'Real url :' + response.geturl()
#print response.info()
#print the_page
k+=1
if k%20==0:
time.sleep(30)
if '即可恢复使用' in the_page:
print 'wait'
time.sleep(30)
print 'wait done'
search(key)
#return 1
match = pattern.search(the_page)
if match:
pass
# 使用Match获得分组信息
#print key +" this key is ok"
#print match.group()
else:
print key+" this key is not ok"
output .write(key+"\n\t")
# Smoke-test with one known keyword before the batch run.
search('水系法师的春天')

# Read keywords one per line. Bug fix: the original bound the handle to
# `file` (shadowing the builtin) and never closed it; `with` guarantees
# the file is closed even if search() raises.
with open("D:\\test.txt") as keyword_file:
    for line in keyword_file:
        # Random 0-2s delay between queries to look less bot-like.
        time.sleep(2 * random.random())
        search(line.strip())

output.close()