前言
- 下方代码中的 get_proxies 方法仅为占位实现,请根据自身的代理来源适当调整
代码
import random
import hashlib
from datetime import datetime
from string import ascii_letters
import requests
class BaseSpider:
    """Small crawler base class: random UA headers, retrying GET, and
    convenience helpers for saving responses to disk.

    Subclasses typically override :meth:`get_proxies` to supply real proxies.
    """

    def __init__(self, session=False):
        """Use a shared ``requests.Session`` when *session* is True,
        otherwise the plain ``requests`` module (no connection reuse)."""
        self.worker = requests.Session() if session else requests

    def get_rand_ua(self) -> str:
        """Return a randomly generated Chrome-style User-Agent string."""
        major = random.randint(55, 100)   # Chrome major version
        build = random.randint(0, 3200)
        patch = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(major, build, patch)
        ua = ' '.join(
            ['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
             '(KHTML, like Gecko)', chrome_version, 'Safari/537.36']
        )
        return ua

    def get_rand_head(self) -> dict:
        """Return a headers dict carrying a random User-Agent."""
        headers = {
            "User-Agent": self.get_rand_ua()
        }
        return headers

    def get_proxies(self) -> dict:
        """Placeholder: return a ``requests``-style proxies mapping.

        Empty dict means "no proxy". Override in subclasses as needed.
        """
        return {}

    def get_rand_name(self, length=9) -> str:
        """Return a random alphanumeric string of *length* characters.

        NOTE: each position first picks letters-vs-digits with equal
        probability, so digits are over-represented versus a uniform draw
        over the combined pool — kept for backward compatibility.
        """
        pools = [
            list(ascii_letters),
            [str(d) for d in range(10)]
        ]
        chars = [random.choice(pools[random.randint(0, 1)]) for _ in range(length)]
        return ''.join(chars)

    def get_curr_time(self) -> str:
        """Return the current local time as ``YYYY-mm-dd HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def get_md5(self, s: str) -> str:
        """Return the hex MD5 digest of *s* (UTF-8 encoded)."""
        return hashlib.md5(s.encode()).hexdigest()

    def parse(self, url, timeout=6, retry=2, hope_code=200):
        """GET *url*, retrying up to *retry* extra times.

        Returns the response when its status equals *hope_code*, otherwise
        ``None`` after all attempts are exhausted. Network errors are logged
        and retried rather than raised.
        """
        headers = self.get_rand_head()
        proxies = self.get_proxies()
        for _ in range(retry + 1):
            try:
                resp = self.worker.get(url, headers=headers, proxies=proxies, timeout=timeout)
            except Exception as e:
                # best-effort: log and fall through to the next attempt
                print("ERROR >> parse >> {}".format(e))
            else:
                if resp.status_code == hope_code:
                    return resp
                else:
                    print("WARNING >> parse >> {} {}".format(resp.status_code, url))
        return None  # explicit: every attempt failed

    def download_html(self, path: str, url: str, encoding=None, tips=True):
        """Fetch *url* and save its text to *path* (``.html`` appended if
        missing). Returns the final path, or ``False`` when the fetch fails.
        """
        resp = self.parse(url)
        if resp is None:
            return False
        path = path if path.endswith('.html') else '{}.html'.format(path)
        with open(path, "w", encoding=encoding or "UTF8") as f:
            f.write(resp.text)
        if tips:
            print('已生成 >> {}'.format(path))
        return path

    def download_img(self, path: str, url: str, tips=True):
        """Fetch *url* and save its raw bytes to *path*.
        Returns *path*, or ``False`` when the fetch fails."""
        resp = self.parse(url)
        if resp is None:
            return False
        with open(path, "wb") as f:
            f.write(resp.content)
        if tips:
            print('已生成 >> {}'.format(path))
        return path

    def save_file(self, path: str, text: str, tips=True, encoding=None):
        """Write *text* to *path* and return *path*.

        *encoding* is new and defaults to ``None`` (the platform default,
        identical to the old behavior); pass e.g. ``"UTF8"`` for portable
        output — the old hard-coded default could raise ``UnicodeEncodeError``
        for non-ASCII text on some platforms.
        """
        with open(path, "w", encoding=encoding) as f:
            f.write(text)
        if tips:
            print('已生成 >> {}'.format(path))
        return path

    def get_curr_ip(self) -> str:
        """Return this machine's public IP as reported by httpbin.org.

        Raises ``RuntimeError`` when the request fails after retries
        (previously this surfaced as an opaque ``AttributeError`` on None).
        """
        url = "https://httpbin.org/ip"
        resp = self.parse(url)
        if resp is None:
            raise RuntimeError("failed to fetch current IP from {}".format(url))
        return resp.json()["origin"]
if __name__ == '__main__':
    # Smoke test: report the public IP seen by httpbin.org.
    print(BaseSpider().get_curr_ip())