'''
校园新闻网列表页User-agent反爬
'''
from common.tools import Tool
import requests
from scrapy.selector import Selector
class User_Agent:
    """Fetch the demo list page and print the text of its list items.

    The file header describes the target as a campus-news list page used as
    a User-Agent anti-crawler exercise, so every request is sent with a
    User-Agent header supplied by the shared ``Tool`` helper.
    """

    # Page requested by run().
    URL = 'http://www.porters.vip/verify/uas/index.html'
    # Seconds to wait for the server; without a timeout requests.get can
    # block forever on a stalled connection.
    TIMEOUT = 10

    def __init__(self):
        """Obtain the helper used to build the request headers."""
        self.tool = Tool.get_tool()

    def run(self):
        """Request the page and print each list item's text.

        Nothing is printed when the response status is not 200.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within ``TIMEOUT`` seconds.
        """
        headers = self.tool.get_user_agent({})
        r = requests.get(url=self.URL, headers=headers, timeout=self.TIMEOUT)
        if r.status_code != 200:
            return

        # Build the selector from the decoded body: Selector's ``response``
        # argument is meant for Scrapy response objects, while ``text`` is
        # the documented entry point when only the markup is available.
        selector = Selector(text=r.text)
        titles = selector.xpath("//ul[@class='list-group']/li/text()").extract()
        for title in titles:
            print(title)
if __name__ == '__main__':
    # Script entry point: build the crawler and run it once.
    crawler = User_Agent()
    crawler.run()
# Tools module (common/tools.py), imported above as `from common.tools import Tool`:
import os
from PIL import Image
import io
import pytesseract
import requests
from fake_useragent import UserAgent
class Tool:
    """Small helper collection for the crawler examples.

    Provides a random User-Agent header, cookie-string parsing and OCR of a
    remote image.
    """

    @classmethod
    def get_tool(cls):
        """Return a new instance of this class."""
        return cls()

    def get_user_agent(self, headers):
        """Set a random ``User-Agent`` entry on *headers* and return it.

        The mapping passed in is modified in place; the same object is
        returned for convenience.
        """
        ua = UserAgent()
        headers['User-Agent'] = ua.random
        return headers

    def get_cookie(self, cookie_str):
        """Parse a comma-separated ``name=value`` string into a dict.

        Each segment is split on its first ``=`` only, so values that
        themselves contain ``=`` (for example base64 padding) are kept
        intact. Whitespace around names and values is stripped, and empty
        segments (empty input, trailing comma) are ignored.

        Args:
            cookie_str: text such as ``"a=1,b=2"``.

        Returns:
            dict mapping cookie names to values; empty for empty input.

        Raises:
            ValueError: if a non-empty segment contains no ``=``.
        """
        cookies = {}
        for segment in cookie_str.split(','):
            segment = segment.strip()
            if not segment:
                continue
            name, separator, value = segment.partition('=')
            if not separator:
                raise ValueError(
                    "malformed cookie segment (missing '='): {!r}".format(segment)
                )
            cookies[name.strip()] = value.strip()
        return cookies

    def tessseract_img(self, img_url, timeout=10):
        """Download an image and return the text that OCR finds in it.

        The method name keeps its original spelling so existing callers
        continue to work.

        The image is converted to greyscale before recognition, and the
        greyscale copy is also written to ``a.jpg`` in the current working
        directory (overwriting any previous file).

        Args:
            img_url: URL of the image to recognise.
            timeout: seconds to wait for the download before giving up.

        Returns:
            The string produced by ``pytesseract.image_to_string`` with
            ``lang='eng'``.
        """
        img_body = requests.get(img_url, timeout=timeout).content
        gray_image = Image.open(io.BytesIO(img_body)).convert('L')
        # save() overwrites an existing file, so no prior removal is needed.
        gray_image.save('a.jpg')
        return pytesseract.image_to_string(gray_image, lang='eng')
# Reference: 《Python3反爬虫原理与绕过实战》 (Python 3 anti-crawler principles and bypass in practice)