总结
-
# Selenium basics: open a page, read page_source, and drive the search box.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# 1. Create the browser object.
# NOTE: if the browser object is a local variable the browser closes
# automatically when the scope ends; a global one must be closed manually
# (web.close()). A context manager guarantees the driver is quit on exit.
with webdriver.Chrome() as web:
    # 2. Open the page (web.maximize_window() would maximize the window).
    web.get('https://www.jd.com')

    # 3. Read the page data.
    # NOTE: page_source only contains what the browser has loaded so far.
    print(web.page_source)

    # 4. Locate and operate on elements.
    # 1) Search box: locate -> type -> press Enter.
    # Renamed from `input` (shadowed the builtin) and switched to the
    # By-based locator API (find_element_by_* was removed in Selenium 4).
    search_box = web.find_element(By.ID, 'key')
    search_box.send_keys('笔记本电脑')
    search_box.send_keys(Keys.ENTER)
    time.sleep(0.1)

    # Locate the box again on the result page and clear the previous text.
    search_box = web.find_element(By.ID, 'key')
    search_box.clear()
    # Alternative: search_box.send_keys(Keys.BACKSPACE * 10)
    search_box.send_keys('86键盘')

    # Submit by clicking the search button instead of pressing Enter.
    search_btn = web.find_element(By.CSS_SELECTOR,
                                  '#search > div > div.form > button')
    search_btn.click()
    time.sleep(0.1)

    # 5. History navigation: back, then forward.
    web.back()
    time.sleep(0.1)
    web.forward()
    time.sleep(60)
-
# Selenium tabs: click a link that opens a new tab, then switch the driver
# back to the original tab via window_handles.
# (Removed the unused `Keys` import; switched to the By-based locator API —
# find_element_by_* was removed in Selenium 4.)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

with webdriver.Chrome() as web:
    web.get('https://www.jd.com')

    # Locate the "seckill" (flash-sale) navigation link; clicking it opens
    # the target in a new tab on jd.com.
    miaosha = web.find_element(By.XPATH, '//*[@id="navitems-group1"]/li[1]/a')
    miaosha.click()
    time.sleep(3)

    # Cap how long page loads may block, then point the driver back at the
    # first (original) tab — the driver does not follow new tabs by itself.
    web.set_page_load_timeout(10)
    web.switch_to.window(web.window_handles[0])
    time.sleep(100)
-
# Scrape JD search results: scroll each result page so lazy-loaded items
# render, parse the listings with lxml/XPath, and stream rows into jd.csv.
# (Switched to the By-based locator API — find_element_by_* was removed in
# Selenium 4 — renamed the builtin-shadowing `input`, fixed the `business`
# column being written as a list repr, and removed the dead commented-out
# BeautifulSoup parser.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from lxml import etree
import re
import time
import csv


def page_generator():
    """Yield the fully-loaded HTML of each search-result page.

    Scrolls the window down in 200px steps so JD's lazily-loaded items
    render, yields page_source, then advances with the RIGHT-arrow key
    (JD's next-page keyboard shortcut) until the URL stops changing,
    which means the last page was reached.
    """
    web = webdriver.Chrome()
    web.get('https://www.jd.com')
    web.maximize_window()

    search_box = web.find_element(By.ID, 'key')
    search_box.send_keys('笔记本电脑')
    search_box.send_keys(Keys.ENTER)

    old_url = web.current_url
    while True:
        time.sleep(0.1)
        # Step-scroll to the bottom; a single jump would skip lazy loading.
        max_height = 10000
        for position in range(200, max_height + 1, 200):
            web.execute_script(f'window.scrollTo(0, {position})')
            time.sleep(0.1)
        yield web.page_source

        # RIGHT arrow on <body> triggers JD's "next page" shortcut.
        body = web.find_element(By.TAG_NAME, 'body')
        body.send_keys(Keys.RIGHT)
        time.sleep(1)
        # If the URL did not change, paging had no effect: last page.
        if old_url == web.current_url:
            break
        old_url = web.current_url
    web.close()


def analysis_data(content: str, all_list: list):
    """Parse one result page and append one row per listing to all_list.

    Each row is [title, detail, img, price, comment_num, comment_url,
    business, tags]; missing optional fields become ''.
    """
    html = etree.HTML(content)
    li_s = html.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/div/div[2]/ul/li')
    https = 'https:'
    for li in li_s:
        title = li.xpath('./div/div[3]/a//*/text()')
        detail = https + li.xpath('./div/div[1]/a/@href')[0]
        img = li.xpath('./div/div[1]/a/img/@src')
        img = https + img[0] if img else ''
        price = li.xpath('./div/div[2]/strong/i/text()')
        price = price[0] if price else ''
        comment_num = li.xpath('./div/div[4]/strong/a/text()')
        comment_num = comment_num[0] if comment_num else ''
        comment_url = li.xpath('./div/div[4]/strong/a/@href')
        comment_url = https + comment_url[0] if comment_url else ''
        business = li.xpath('./div/div[5]/span/a/text()')
        # BUG FIX: the original kept `business` as a list, so the CSV cell
        # contained a Python list repr; extract the first string instead.
        business = business[0] if business else ''
        tags = li.xpath('./div/div[6]/i/text()')
        one_list = ['|'.join(re.sub(r'\s+', '', x) for x in title),
                    detail, img, price, comment_num, comment_url,
                    business, '|'.join(tags).strip()]
        all_list.append(one_list)


if __name__ == '__main__':
    data_list = []
    page = page_generator()
    # Open the file once in write mode and reuse one writer: the original
    # opened it twice ('w' then 'a') and rebuilt a csv.writer per iteration.
    with open(r'./jd.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'detail', 'img', 'price', 'comment_num',
                         'comment_url', 'business', 'tags'])
        while True:
            try:
                analysis_data(next(page), data_list)
            except StopIteration:
                break
            finally:
                # Flush whatever was parsed so far even if iteration stops.
                writer.writerows(data_list)
                data_list.clear()