重要模块
- requests:获取网页
- pyquery:解析网页
- fake_useragent:更改请求的User-Agent设置
- re:正则表达式筛选内容
上重要代码
- 获取网页
# Proxy settings: http and https URLs are both routed through the same proxy address.
proxies = dict(http='http://' + proty, https='https://' + proty)
# A random User-Agent string is generated for this request.
request_headers = {"user-agent": UserAgent().random}
html = requests.get(url, headers=request_headers, timeout=4, proxies=proxies)  # the fetched response
html.encoding = 'utf-8'  # decode the response body as UTF-8
if html.status_code == 200:  # continue only when the request succeeded
    print('请求成功:' + url + ',状态码:' + str(html.status_code) + '......')
    content = pq(html.text)  # pq is the alias given to PyQuery at import time
- 解析网页
- 获取内容
# Select the element with id "aaa" and read its text content.
sex = content("#aaa").text()
- 获取标签属性:例如a标签的href
# Read the href attribute of the <a> element found inside #aaa.
content("#aaa a").attr('href')
- 循环操作标签,如获取到了一组tr标签
# After selecting the <tr> elements, .items() must be used so that each row is
# yielded as its own PyQuery object.
for item in content("tr").items():
    # text is a method: without the call parentheses this would print the
    # bound method object instead of the row's text.
    print(item.text())
项目代码:封装获取网站代码
注意:这里仅获取网页,并将其处理成pyquery对象后返回;代码中用到了代理池、线程锁等内容,需要结合我后续写的文章才能完全理解,这里贴出来只是提供一个思路。
代码
import requests # 导入requests包c
from pyquery import PyQuery as pq
from getProtyPool import getPool
from toMongodb.monSearchMany import mon_search_many
from toMongodb.monDeleteOne import mon_delete_one
from fake_useragent import UserAgent
import re,threading
# Lock guarding modifications of the shared protyPool resource.
threadLock = threading.Lock()
# Seed the proxy pool from mon_search_many('ip_cool', ...); when that result is
# falsy, fall back to getPool.getProtyPool().
# NOTE(review): fallback entries are indexed with ["proty"] as well — confirm
# that getProtyPool() yields mappings of that shape.
_seed_entries = mon_search_many('ip_cool', {}, {"proty": 1}) or getPool.getProtyPool()
protyPool = [entry["proty"] for entry in _seed_entries]
def testToProtyUrl(proty, url):
    """Fetch ``url`` through the proxy ``proty`` and return the parsed page.

    Args:
        proty: Proxy address without a scheme; it is appended to ``http://``
            and ``https://`` to build the proxy URLs (presumably
            ``host:port`` — confirm against the pool contents).
        url: Address of the page to download.

    Returns:
        ``pq(html.text)`` when the response status is 200. In every other
        case the result of ``getHtml(url)``, i.e. the request is retried with
        another proxy taken from the pool.

    Side effects:
        ``mon_delete_one('ip_cool', {"proty": proty})`` is called when the
        proxy answers with a 4xx status or when the request/parsing raises.

    NOTE: every failed attempt re-enters ``getHtml``, so a long run of bad
    proxies deepens the call stack.
    """
    proxies = {
        "http": 'http://' + proty,
        # NOTE(review): an ``https://`` proxy URL makes the client speak TLS
        # to the proxy itself; plain HTTP proxies usually need ``http://``
        # here as well — confirm what kind of proxies the pool holds.
        "https": 'https://' + proty,
    }
    print('正在使用代理:http://' + proty)
    try:
        html = requests.get(
            url,
            proxies=proxies,
            timeout=4,
            headers={"user-agent": UserAgent().random},
        )
        html.encoding = 'utf-8'
        status = html.status_code
        if status == 200:
            print('请求成功:' + url + ',状态码:' + str(status) + '......')
            return pq(html.text)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are no longer swallowed and
        # mistaken for a dead proxy. Only the request and the parsing are
        # guarded: the retry for a non-200 response happens outside this
        # ``try`` so failures of later attempts are not blamed on this proxy.
        print('连接错误,正在删除该代理,即将更换代理....................')
        mon_delete_one('ip_cool', {"proty": proty})
        return getHtml(url)

    # Only non-200 responses reach this point.
    if 400 <= status < 500:
        # 4xx: the resource refused the request through this proxy, drop it.
        print('请求失败:' + url + ',状态:' +
              str(status), ',资源拒绝,正在删除该代理,即将更换代理...')
        mon_delete_one('ip_cool', {"proty": proty})
    else:
        print('请求失败:' + url + ',状态:' +
              str(status), ',即将更换代理......................')
    return getHtml(url)
def getMorePool():
    """Refill the shared ``protyPool`` list until it holds at least 20 entries.

    Each round appends the ``"proty"`` value of every row returned by
    ``mon_search_many('ip_cool', ...)``; if the pool is still below 20, the
    entries from ``getPool.getProtyPool()`` are appended too. Rounds repeat
    until the pool reaches 20 entries. Rows are appended without
    de-duplication, so the pool may contain repeated proxies.

    The function does not take ``threadLock`` itself.

    Raises:
        RuntimeError: when a complete round adds no entry at all, i.e. both
            sources are empty. The previous implementation recursed in that
            situation until the interpreter raised ``RecursionError`` (a
            ``RuntimeError`` subclass).
    """
    while True:
        print('当前代理池少于1条,即将增加IP池......')
        size_before = len(protyPool)

        # ``or []`` tolerates a falsy result (e.g. None) from the query.
        for row in mon_search_many('ip_cool', {}, {"proty": 1}) or []:
            protyPool.append(row["proty"])

        if len(protyPool) < 20:
            for entry in getPool.getProtyPool() or []:
                # The module-level seeding reads entry["proty"]; accept both
                # that mapping shape and a plain proxy string so the pool
                # always ends up holding address strings.
                protyPool.append(
                    entry["proty"] if isinstance(entry, dict) else entry
                )

        if len(protyPool) >= 20:
            return
        if len(protyPool) == size_before:
            raise RuntimeError(
                'proxy pool cannot be refilled: no proxies available from '
                'the database or from getProtyPool()'
            )
def getHtml(url):
    """Take a proxy from the shared pool and fetch ``url`` through it.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``testToProtyUrl(proty, url)`` returns for the chosen proxy.

    The pool is inspected and modified only while ``threadLock`` is held.
    When the pool is empty, ``getMorePool()`` is called under the lock and
    the check is repeated. The request itself runs after the lock has been
    released.
    """
    while True:
        # ``with`` guarantees the lock is released even if getMorePool()
        # raises; a manual acquire()/release() pair would leave it held and
        # block every other thread.
        with threadLock:
            if protyPool:
                print('当前代理池数目:' + str(len(protyPool)))
                proty = protyPool.pop()
                break
            getMorePool()
    return testToProtyUrl(proty, url)
结尾
后续会有更多相关内容。