Python 动态采集
Selenium:用 Python 写爬虫的时候,主要用的是 Selenium 的 WebDriver。
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #引入keys包 要想调用键盘按键操作需要引入keys包:
#通过send_keys()调用按键:
#send_keys(Keys.TAB) TAB
#send_keys(Keys.ENTER) 回车
#输入框输入内容
#driver.find_element_by_id("id").send_keys("要输入的内容")  (注:Selenium 4.3 起该方法已移除,请改用 driver.find_element(By.ID, "id"))
def main():
    """Render a page in Chrome and print its image sources and paragraph links.

    The target URL is opened in a Selenium-driven Chrome browser, the rendered
    page source is parsed with BeautifulSoup (``lxml`` parser), and then:

    * the ``src`` of every ``<img>`` inside ``<body>`` is printed;
    * the ``href`` of every ``<a>`` nested in a ``<p>`` is printed twice,
      once per attribute-access style, to demonstrate that both are equivalent.
    """
    # webdriver drives a real browser; several browsers are supported and
    # Chrome is used here as the example.
    driver = webdriver.Chrome()
    try:
        driver.get('要采集的地址')
        # page_source holds the HTML of the page as currently rendered.
        page_html = driver.page_source
    finally:
        # Always shut the browser down, even if navigation fails, so no
        # Chrome/chromedriver process is left running.
        driver.quit()

    soup = BeautifulSoup(page_html, 'lxml')

    # 'body img[src]' matches only <img> tags that carry a src attribute and
    # simply yields nothing when the document has no <body>.
    for img_tag in soup.select('body img[src]'):
        print(img_tag.attrs['src'])

    # select() returns a list of Tag objects (e.g. soup.select('a.cla') gives
    # every <a> whose class is "cla"), so attributes can be read from each one.
    # Restricting the selector to a[href] skips anchors without an href.
    for link in soup.select('p a[href]'):
        # Style 1: subscript the tag directly.
        print(link['href'])
        # Style 2: go through the attrs dictionary.
        print(link.attrs['href'])
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
selenium.webdriver.common.keys.Keys 类中定义的按键常量一览:
ADD = '\ue025'
ALT = '\ue00a'
ARROW_DOWN = '\ue015'
ARROW_LEFT = '\ue012'
ARROW_RIGHT = '\ue014'
ARROW_UP = '\ue013'
BACKSPACE = '\ue003'
BACK_SPACE = '\ue003'
CANCEL = '\ue001'
CLEAR = '\ue005'
COMMAND = '\ue03d'
CONTROL = '\ue009'
DECIMAL = '\ue028'
DELETE = '\ue017'
DIVIDE = '\ue029'
DOWN = '\ue015'
END = '\ue010'
ENTER = '\ue007'
EQUALS = '\ue019'
ESCAPE = '\ue00c'
F1 = '\ue031'
F10 = '\ue03a'
F11 = '\ue03b'
F12 = '\ue03c'
F2 = '\ue032'
F3 = '\ue033'
F4 = '\ue034'
F5 = '\ue035'
F6 = '\ue036'
F7 = '\ue037'
F8 = '\ue038'
F9 = '\ue039'
HELP = '\ue002'
HOME = '\ue011'
INSERT = '\ue016'
LEFT = '\ue012'
LEFT_ALT = '\ue00a'
LEFT_CONTROL = '\ue009'
LEFT_SHIFT = '\ue008'
META = '\ue03d'
MULTIPLY = '\ue024'
NULL = '\ue000'
NUMPAD0 = '\ue01a'
NUMPAD1 = '\ue01b'
NUMPAD2 = '\ue01c'
NUMPAD3 = '\ue01d'
NUMPAD4 = '\ue01e'
NUMPAD5 = '\ue01f'
NUMPAD6 = '\ue020'
NUMPAD7 = '\ue021'
NUMPAD8 = '\ue022'
NUMPAD9 = '\ue023'
PAGE_DOWN = '\ue00f'
PAGE_UP = '\ue00e'
PAUSE = '\ue00b'
RETURN = '\ue006'
RIGHT = '\ue014'
SEMICOLON = '\ue018'
SEPARATOR = '\ue026'
SHIFT = '\ue008'
SPACE = '\ue00d'
SUBTRACT = '\ue027'
TAB = '\ue004'
UP = '\ue013'