分析网站
首先打开谷歌浏览器,按 F12 打开开发者调试工具。
代码
话不多说,直接上代码
import json
import random
import time
import requests
import logging
import re
import pymongo
from pyquery import PyQuery as pq
import urllib
import multiprocessing
# What each imported module is used for:
#   requests        - fetching pages
#   logging         - printing progress information
#   re              - regular-expression parsing
#   pyquery         - parsing HTML pages directly
#   pymongo         - storing results in MongoDB
#   multiprocessing - speeding things up with multiple processes
#                     (process-based parallelism, not threads)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# URL handed to api() to obtain a proxy address.
# NOTE(review): 'http://xxxxxx' looks like a placeholder - replace with the
# real proxy-provider endpoint before running.
ip = 'http://xxxxxx'
def api(thisapi, max_retries=10, timeout=10):
    """Fetch a proxy address from ``thisapi`` and wrap it as a proxies dict.

    The URL is requested up to ``max_retries`` times. The first token in the
    response body that looks like ``a.b.c.d`` or ``a.b.c.d:port`` is taken as
    the proxy address; any other digits or colons in the body are ignored.

    Args:
        thisapi: URL of the endpoint that returns a proxy address as text.
        max_retries: Maximum number of attempts before giving up.
        timeout: Per-attempt timeout in seconds, so a dead endpoint cannot
            block the caller forever.

    Returns:
        ``{"http": "http://<address>"}`` on success, or ``None`` when every
        attempt failed or no address could be found in the response.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule;
    # import it explicitly instead of relying on another package doing so.
    import urllib.request

    address_pattern = re.compile(r'\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?')

    for attempt in range(1, max_retries + 1):
        try:
            urllib.request.urlcleanup()
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(thisapi, timeout=timeout) as response:
                body = response.read().decode("utf-8", "ignore")
        except Exception:
            # Best-effort by design: any fetch failure (network error,
            # timeout, malformed URL) just counts as one failed attempt.
            body = ""
        else:
            print("当前用的ip是:" + body)

        found = address_pattern.search(body)
        if found:
            return {"http": "http://" + found.group(0)}

        print("代理ip无效!正在第{}次重新切换中...".format(attempt))

    return None
# Request headers that present the client as a desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.131 Safari/537.36'
    ),
}
def get_url(url):
proxy=api(ip)
count=0
while