本文以CSDN论坛为例:
- 源码:
from selenium import webdriver
from lxml import html
from PIL import Image
from selenium.webdriver.support.select import Select
import requests
import re
import urllib
import time
import cv2
import pytesseract
import socket
import numpy as np
def MidString(content, startStr, endStr):
    """Return the part of *content* between *startStr* and *endStr*.

    The first occurrence of *startStr* is located, and *endStr* is then
    searched for only after the end of that occurrence, so an earlier
    occurrence of *endStr* cannot produce an empty or wrong slice.

    Args:
        content: The string to search in.
        startStr: Marker that precedes the wanted text.
        endStr: Marker that follows the wanted text.

    Returns:
        The substring strictly between the two markers (may be empty).

    Raises:
        ValueError: If *startStr* is not found, or *endStr* does not occur
            after it.
    """
    # str.index raises ValueError on a miss (it never returns -1), so no
    # sentinel check is needed here.
    startIndex = content.index(startStr) + len(startStr)
    # Start the search after the start marker, not at the beginning.
    endIndex = content.index(endStr, startIndex)
    return content[startIndex:endIndex]
def html():
    """Fetch a CSDN blog page and print a ``title`` attribute found in it.

    Downloads the hard-coded blog URL with a browser-like User-Agent,
    looks for the first element whose markup contains
    ``style="min-width:58px" title="...">`` and prints the text of that
    ``title`` attribute (up to the first ``">`` on the line).

    NOTE: this function's name shadows the ``html`` name imported from
    ``lxml`` at the top of the file.

    Raises:
        ValueError: If the expected markup is not present in the page.
    """
    # Pretend to be a browser; CSDN rejects requests that look like
    # direct (non-browser) access.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    Ques_Url = 'https://blog.csdn.net/qq_34916678'
    # Without a timeout, requests can block indefinitely on a stalled server.
    r = requests.get(Ques_Url, headers=headers, timeout=10)
    # Capture the attribute value directly. Parsing str(match) is unreliable
    # because the Match repr truncates and escapes the matched text.
    pattern = r'style="min-width:58px" title="(.*?)">'
    found = re.search(pattern, r.text)
    if found is None:
        raise ValueError('title attribute not found in ' + Ques_Url)
    print(found.group(1))
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    # SATRT()  # NOTE(review): disabled call; SATRT is not defined in the visible code
    html()
本程序仅供学习交流使用,请勿用于任何非法用途或商业等营利性用途!