# 陆金所投资爬虫 (Lufax investment crawler)
#coding=utf-8
import re
import urllib.request
from urllib.error import URLError, HTTPError
import time
import sys
import webbrowser
import socket
sys.path.append("libs")
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Failed requests are retried silently.  After 20 failed attempts the
    function gives up and returns ``None``, so callers must check the
    result before decoding it.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as ``bytes``, or ``None`` if every attempt failed.
    """
    max_attempts = 20
    for _ in range(max_attempts):
        try:
            # The context manager closes the connection even when read()
            # itself fails part-way through.
            with urllib.request.urlopen(url) as page:
                return page.read()
        except Exception:
            # Request failures are expected and simply retried.  Catching
            # ``Exception`` rather than using a bare ``except`` lets Ctrl-C
            # and SystemExit still stop the script.
            continue
    return None
def getUrl(html):
    """Search the page for a listing link that is followed by an 8.40% rate.

    The page bytes are decoded as UTF-8.  The pattern captures the part of
    an ``<a href='/list/...'`` target after ``/list/`` when that anchor is
    followed, 50 to 500 characters after its ``title=`` attribute, by a
    ``num-style`` paragraph reading ``8.40%``.

    Args:
        html: Raw page content as ``bytes``.

    Returns:
        The first ``re.Match`` found, or ``None`` when nothing matches.
    """
    page_text = html.decode('utf-8')
    link_pattern = re.compile(
        r'(?<=<a href=\'/list/).*(?=\' target="_blank" title=[\s\S]{50,500}<p class="num-style">8\.40%</p>)'
    )
    return link_pattern.search(page_text)
def haveNextPage(html):
    """Tell whether the listing page offers a further page of results.

    The page is treated as the last one when it contains the CSS class
    string of a disabled "next" button.

    Args:
        html: Raw page content as ``bytes`` (decoded as UTF-8).

    Returns:
        ``False`` if the disabled-next marker is present, ``True`` otherwise.
    """
    disabled_next_marker = re.compile(r'btns btn_page disabled btn_small next')
    page_text = html.decode('utf-8')
    return disabled_next_marker.search(page_text) is None
# Poll the Lufax transfer listing, page by page, until getUrl() reports a
# matching product; then open that product in the browser and stop.
timeout = 3
socket.setdefaulttimeout(timeout)  # seconds; default for every new socket

minMoney = 10000
maxMoney = 15000

while True:
    # Every pass restarts the scan at the first result page.
    currentPage = 1
    while True:
        # The original text had "&currentPage" garbled into "¤tPage"
        # (HTML-entity decoding of "&curren"), which left a non-ASCII
        # character in the URL and broke the page-number lookup.  The
        # page number is now tracked separately and the URL rebuilt.
        targetUrl = (
            "https://list.lu.com/list/transfer/anyi?minMoney=" + str(minMoney)
            + "&maxMoney=" + str(maxMoney)
            + "&minDays=&maxDays=&minRate=&maxRate=&mode=&subType=&instId="
            + "&haitongGrade=&fundGroupId=&trade=FIX_PRICE&isCx="
            + "&currentPage=" + str(currentPage)
            + "&orderType=transfer_price&orderAsc=true"
        )
        print(targetUrl)
        html = getHtml(targetUrl)
        if html is None:
            # getHtml() exhausted its retries; start a fresh scan instead
            # of crashing on html.decode().
            break
        url = getUrl(html)
        if url:
            print(url)
            print(targetUrl)
            webbrowser.open('https://list.lu.com/list/' + url.group())
            # sys.exit() instead of exit(): the latter is injected by the
            # site module and is meant for interactive use.
            sys.exit()
        if not haveNextPage(html):
            break
        currentPage += 1
#coding=utf-8
import re
import urllib.request
from urllib.error import URLError, HTTPError
import time
import sys
import webbrowser
import socket
sys.path.append("libs")
def getHtml(url):
    """Download ``url`` and return the response body as bytes.

    A failing request is retried without any output.  Once 20 attempts
    have failed the function returns ``None``; callers have to handle
    that case before calling ``.decode()`` on the result.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        ``bytes`` with the response body, or ``None`` after 20 failures.
    """
    attempts_left = 20
    while attempts_left > 0:
        try:
            # ``with`` guarantees the response is closed, including when
            # read() raises (e.g. on a socket timeout).
            with urllib.request.urlopen(url) as page:
                return page.read()
        except Exception:
            # A bare ``except`` would also trap KeyboardInterrupt and
            # SystemExit; ``Exception`` keeps the script interruptible.
            attempts_left -= 1
    return None
def getUrl(html):
    """Find the first ``/list/`` link on the page.

    The page bytes are decoded as UTF-8.  The pattern captures, as
    shortly as possible, the text between ``<a href='/list/`` and the
    following ``' target="_blank"``.

    Args:
        html: Raw page content as ``bytes``.

    Returns:
        The first ``re.Match`` found, or ``None`` when nothing matches.
    """
    page_text = html.decode('utf-8')
    link_pattern = re.compile(r'(?<=<a href=\'/list/).*?(?=\' target="_blank")')
    return link_pattern.search(page_text)
# Keep requesting the first page of the rate-filtered Lufax transfer
# listing until it contains a product link, then open that product in the
# browser and stop.
timeout = 3
socket.setdefaulttimeout(timeout)  # seconds; default for every new socket

minMoney = 20000
maxMoney = 30000
# The original text had "&currentPage" garbled into "¤tPage" (HTML-entity
# decoding of "&curren"); the non-ASCII character made every request fail.
targetUrl = (
    "https://list.lu.com/list/transfer/anyi?minMoney=" + str(minMoney)
    + "&maxMoney=" + str(maxMoney)
    + "&minDays=&maxDays=&minRate=0.083&maxRate=0.085&mode=&subType=&instId="
    + "&haitongGrade=&fundGroupId=&trade=FIX_PRICE&isCx="
    + "&currentPage=1&orderType=transfer_price&orderAsc=true"
)

while True:
    print(targetUrl)
    html = getHtml(targetUrl)
    if html is None:
        # getHtml() exhausted its retries; poll again instead of crashing
        # on html.decode().
        continue
    url = getUrl(html)
    if url:
        webbrowser.open('https://list.lu.com/list/' + url.group())
        # sys.exit() instead of exit(): the latter is injected by the site
        # module and is meant for interactive use.
        sys.exit()
# 百度网页url抓取爬虫 (Baidu search-result URL crawler)
#coding=utf-8
import re
import urllib.request
from urllib.error import URLError, HTTPError
import time
import socket
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Each failed request prints a retry notice together with the number of
    failures so far.  After 20 failed attempts the function gives up and
    returns ``None``, so callers must check the result before decoding it.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as ``bytes``, or ``None`` if every attempt failed.
    """
    max_attempts = 20
    for fails in range(1, max_attempts + 1):
        try:
            # The context manager closes the connection even when read()
            # itself fails part-way through.
            with urllib.request.urlopen(url) as page:
                return page.read()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # and SystemExit are not swallowed by the retry loop.
            print ('网络连接出现问题, 正在尝试再次请求: ', fails)
    return None
def getUrl(html):
    """Collect the result links found on a search page.

    The page bytes are decoded as UTF-8.  A link is the text after
    `` href = "`` on a line whose closing quote is followed by two
    newlines, two tabs and `` target="_blank"``.

    Args:
        html: Raw page content as ``bytes``.

    Returns:
        A list of the matched link strings; empty when there are none.
    """
    page_text = html.decode('utf-8')
    link_pattern = re.compile(r'(?<= href = ").*(?="\n\n\t\t target="_blank")')
    return link_pattern.findall(page_text)
def getNextUrl(html):
    """Extract the href of the "next page" link from a search page.

    The page bytes are decoded as UTF-8.  The pattern captures 50 to 300
    characters that follow ``</span></a><a href="``, do not end in
    ``</span>``, and are followed by ``" class="n">`` and the character
    ``下``.

    Args:
        html: Raw page content as ``bytes``.

    Returns:
        A list of the matched href strings; empty when there are none.
    """
    page_text = html.decode('utf-8')
    next_link_pattern = re.compile(
        r'(?<=</span></a><a href=").{50,300}(?<!</span>)(?=" class="n">下)'
    )
    return next_link_pattern.findall(page_text)
# For every query listed in qList.txt (one per line): walk the Baidu result
# pages, open each result link, and write the URL it finally lands on to
# "<query>.txt".
timeout = 3
socket.setdefaulttimeout(timeout)  # seconds; default for every new socket
total = 0  # number of URLs written across all queries

with open('qList.txt') as inputList:
    for filename in inputList:
        # Strip only the line terminator.  Slicing with [:-1] would drop a
        # real character when the last line has no trailing newline.
        query = filename.rstrip('\n')
        if not query:
            # A blank line would otherwise create ".txt" and search for "".
            continue
        print (query)
        targetUrl = "http://www.baidu.com/s?wd=" + urllib.request.quote(query)
        print (targetUrl)
        # ``with`` closes the output file even if the crawl raises.
        with open(query + '.txt', 'w') as outputList:
            while True:
                html = getHtml(targetUrl)
                if html is None:
                    # getHtml() exhausted its retries; give up on this
                    # query instead of crashing on html.decode().
                    break
                for url in getUrl(html):
                    try:
                        # Following the link yields the address it
                        # redirects to; the response is closed right away.
                        with urllib.request.urlopen(url) as u:
                            redirectUrl = u.geturl()
                    except Exception:
                        # Unreachable results are skipped.  ``Exception``
                        # keeps Ctrl-C working, unlike a bare ``except``.
                        continue
                    print (redirectUrl)
                    outputList.write(redirectUrl + '\n')
                    total = total + 1
                nextUrl = getNextUrl(html)
                if not nextUrl:
                    break
                targetUrl = "http://www.baidu.com" + nextUrl[0]

print ('done!')