import asyncio
from tqdm import tqdm
import pandas as pd
from pyppeteer import launch

# Forward proxy for Chromium — placeholder; fill in a real host:port before running.
proxy = 'http://***.***.***.***:***'
# Chromium launch flags: route traffic through the proxy, hide the automation
# infobar, fix the viewport size, and disable the sandbox.
args = [f'--proxy-server={proxy}', '--disable-infobars', '--window-size=1920,1080', '--no-sandbox']
# Proxy credentials — placeholders. NOTE(review): keep real secrets out of source control.
username = "******"
password = "******"
# Index page listing all THS (同花顺) concept boards.
concept_url = "http://q.10jqka.com.cn/gn/"
async def create_browser(headless=False):
    """Launch a Chromium instance configured for the scraper.

    Parameters
    ----------
    headless : bool, optional
        Run without a visible window. Defaults to False (the original
        behavior) so a human can resolve captchas/logins manually.

    Returns
    -------
    The pyppeteer ``Browser`` handle; caller is responsible for closing it.
    """
    browser = await launch(
        headless=headless,
        args=args,  # module-level flags: proxy, window size, no-sandbox
        userDataDir='./user_data',  # persist cookies/session between runs
        # Hide the "Chrome is being controlled by automated software" banner.
        ignoreDefaultArgs=['--enable-automation'],
    )
    return browser
async def get_concept_code(page):
    """Extract the concept-board code from the board heading <span>.

    Parameters
    ----------
    page : pyppeteer page already navigated to a concept-board detail page.

    Returns
    -------
    str
        The heading span's text content, or "" when the element is absent
        or the lookup fails (best-effort: errors are logged, not raised).
    """
    selector = (
        "body > div.container.w1200 > div:nth-child(3) > div.body > div > "
        "div.board-main.w900 > div.heading > div.board-hq > h3 > span"
    )
    try:
        element = await page.querySelector(selector)
        if element is None:
            # querySelector returns None (no exception) when the node is
            # missing — e.g. layout change or page not fully loaded.
            return ""
        return await (await element.getProperty('textContent')).jsonValue()
    except Exception as e:
        print("get_concept_code error:", e)
        return ""
async def get_stock_text(concept, concept_code, page):
    """Read the member-stock table visible on the current page.

    Parameters
    ----------
    concept : str
        Concept-board name (stored into each output row).
    concept_code : str
        Concept-board code (stored into each output row).
    page : pyppeteer page showing a concept-board detail page.

    Returns
    -------
    list[list[str]]
        One ``[concept_code, concept, stock_code, stock_name]`` row per
        table row. Rows whose cells cannot be read are logged and skipped.
    """
    concept_stock = []
    for stock in await page.querySelectorAll("#maincont > table > tbody > tr"):
        try:
            # Fetch the cell list once per row; the original queried "td"
            # twice, doubling the browser round-trips.
            cells = await stock.querySelectorAll("td")
            code = await (await cells[1].getProperty('textContent')).jsonValue()
            name = await (await cells[2].getProperty('textContent')).jsonValue()
            concept_stock.append([concept_code, concept, code, name])
        except Exception as e:
            # Best-effort per row: a malformed row must not abort the page.
            print("get_stock_text error:", e)
    return concept_stock
async def spider_stock_data(url, concept, browser):
    """Scrape every member stock of one concept board, following pagination.

    Navigates the browser's first tab to *url*, scrapes the stock table,
    then repeatedly clicks the "next page" control until the pager no
    longer shows a 尾页 ("last page") link.

    Parameters
    ----------
    url : str
        Concept-board detail-page URL.
    concept : str
        Concept-board name (copied into every output row).
    browser : pyppeteer browser created by ``create_browser``.

    Returns
    -------
    pandas.DataFrame
        Columns: concept_code, concept_name, stock_code, stock_name.
    """
    # Reuse the browser's initial blank tab rather than opening a new page.
    page = (await browser.pages())[0]
    await page.setJavaScriptEnabled(True)
    await page.setViewport({'width': 1920, 'height': 1080})
    # HTTP proxy auth (module-level credentials).
    await page.authenticate({'username': username, 'password': password})
    await page.goto(url)
    concept_code = await get_concept_code(page)
    # First page of results.
    concept_stock = await get_stock_text(concept, concept_code, page)
    # Pager links ("#m-page > a"); empty when the board fits on one page.
    tab_ind = await page.querySelectorAll("#m-page > a")
    tab_length = len(tab_ind)
    if tab_length > 0:
        start_tab, last_tab = tab_ind[0], tab_ind[-1]
        last_tab_str = await (await last_tab.getProperty('textContent')).jsonValue()
        # "a.cur" is the highlighted current-page number; remember it so we
        # can detect when the AJAX page swap has actually completed.
        cur_tab = await page.querySelector("#m-page > a.cur")
        pre_cur_str = await (await cur_tab.getProperty('textContent')).jsonValue()
        # While the pager still shows 尾页 ("last page") there are more pages.
        while last_tab_str == "尾页":
            try:
                # Second-to-last pager link is the "next page" control.
                await page.click(f"#m-page > a:nth-child({tab_length - 1})")
                await page.waitFor(200)
                cur_tab = await page.querySelector("#m-page > a.cur")
                cur_str = await (await cur_tab.getProperty('textContent')).jsonValue()
                # Poll until the highlighted page number changes, i.e. the
                # table content has been replaced by the new page.
                # NOTE(review): no timeout — if the site stops responding
                # this polls forever; consider a retry cap.
                while cur_str == pre_cur_str:
                    await page.waitFor(50)
                    cur_tab = await page.querySelector("#m-page > a.cur")
                    cur_str = await (await cur_tab.getProperty('textContent')).jsonValue()
                concept_stock.extend(await get_stock_text(concept, concept_code, page))
                pre_cur_str = cur_str
                # Re-read the pager: on the final page 尾页 disappears,
                # which terminates the outer while.
                tab_ind = await page.querySelectorAll("#m-page > a")
                tab_length = len(tab_ind)
                start_tab, last_tab = tab_ind[0], tab_ind[-1]
                last_tab_str = await (await last_tab.getProperty('textContent')).jsonValue()
            except Exception as e:
                # Log and retry the same click on the next loop iteration.
                # NOTE(review): a persistent failure loops indefinitely.
                print("Error:", e)
                print("Url:", url)
                print("pre_cur_str:", pre_cur_str)
    concept_stock = pd.DataFrame(concept_stock)
    concept_stock.rename(
        columns={0: "concept_code", 1: "concept_name", 2: "stock_code", 3: "stock_name"},
        inplace=True
    )
    return concept_stock
async def spider_concept_data(url, browser):
    """Collect every concept board's name and link from the index page.

    Parameters
    ----------
    url : str
        Concept-board index URL (``concept_url``).
    browser : pyppeteer browser created by ``create_browser``.

    Returns
    -------
    list[list[str]]
        One ``[concept_name, href]`` pair per board link found.
    """
    tab = await browser.newPage()
    await tab.setJavaScriptEnabled(True)
    await tab.setViewport({'width': 1920, 'height': 1080})
    # HTTP proxy auth (module-level credentials).
    await tab.authenticate({'username': username, 'password': password})
    await tab.goto(url)

    anchors = await tab.querySelectorAll(
        "body > div.container.w1200 > div.category.boxShadow.m_links > div > div > div > a"
    )
    boards = []
    for anchor in anchors:
        label = await (await anchor.getProperty('textContent')).jsonValue()
        link = await (await anchor.getProperty('href')).jsonValue()
        boards.append([label, link])
    return boards
async def main():
    """Scrape all THS concept boards and their member stocks to CSV files.

    Writes ``ths_concept_20220113.csv`` (board name + link) and
    ``ths_concept_stock_20220113.csv`` (one row per board/stock pair).
    """
    browser = await create_browser()
    try:
        concept_df = pd.DataFrame(await spider_concept_data(concept_url, browser))
        concept_df.rename(columns={0: "concept", 1: "href"}, inplace=True)
        concept_df.to_csv("ths_concept_20220113.csv", index=False)
        # Round-trip through the CSV so the board list is on disk before the
        # long per-board crawl starts.
        concept_df = pd.read_csv("ths_concept_20220113.csv")
        for i in tqdm(concept_df.index):
            concept_name = concept_df.loc[i, "concept"]
            href = concept_df.loc[i, "href"]
            concept_stock = await spider_stock_data(href, concept_name, browser)
            # First batch writes the header; later batches append rows only
            # (header=False is the documented to_csv value, not None).
            if i == 0:
                concept_stock.to_csv("ths_concept_stock_20220113.csv", index=False)
            else:
                concept_stock.to_csv(
                    "ths_concept_stock_20220113.csv", index=False, mode="a", header=False
                )
    finally:
        # Always release Chromium, even when a navigation/scrape raises —
        # the original leaked the browser process on any exception.
        await browser.close()
# Script entry point: run the full scrape to completion.
# NOTE(review): executes at import time — consider an `if __name__ == "__main__":`
# guard. asyncio.get_event_loop() here is deprecated since Python 3.10;
# asyncio.run(main()) is the modern form (confirm pyppeteer compatibility first).
asyncio.get_event_loop().run_until_complete(main())
# 模拟浏览器点击进行爬取同花顺概念板块数据
# (Scrapes THS concept-board data by simulating browser clicks.)
# 最新推荐文章于 2024-07-21 21:11:33 发布 (blog-footer residue from the original paste)