import urllib
import re
import pandas as pd
import pymysql
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pprint
from sqlalchemy import create_engine
# Step 2: define the download function
def download_all_htmls():
    """Download the fund-flow (zjlx) page for stock 000001 from eastmoney.com.

    Returns:
        list[str]: the raw HTML text of each downloaded page (currently one).

    Raises:
        Exception: if any request does not return HTTP 200.
    """
    htmls = []
    # range(1): only a single page today; the loop is kept so additional
    # page indices can be added later without restructuring.
    for idx in range(1):
        url = "https://data.eastmoney.com/zjlx/000001.html"
        print("craw html:", url)
        r = requests.get(url)
        if r.status_code != 200:
            # Include the status code and URL so failures are diagnosable.
            raise Exception(f"error: HTTP {r.status_code} for {url}")
        htmls.append(r.text)
    return htmls
# Step 3: run the crawl
# Fetch every page once, at module level, before parsing.
htmls = download_all_htmls()
def parse_single_html(html):
    """Extract the cell texts of the fund-flow table from one HTML page.

    Args:
        html (str): raw HTML of an eastmoney fund-flow page.

    Returns:
        list[str]: the text of every <td> cell, flattened row by row.

    Raises:
        AttributeError: if the expected wrapper div / table / tbody is
            missing from the page (each ``find`` would return None).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Navigate to the data table; each <tr> is one row of the fund-flow list.
    # NOTE(review): class name "sinstock-filter-wrap" is site-specific —
    # confirm it still matches the live page markup.
    rows = (
        soup.find("div", class_="sinstock-filter-wrap")
        .find("table")
        .find("tbody")
        .find_all("tr")
    )
    datas = []
    for row in rows:
        # Flatten all cell texts of this row into the output list.
        for cell in row.find_all('td'):
            datas.append(cell.get_text())
    print(datas)
    return datas
# Step 4: parse the HTML
# Parse every downloaded page into one flat list of table-cell texts.
all_datas = []
for html in htmls:
    all_datas.extend(parse_single_html(html))
print(all_datas)
# NOTE: a bare `len(all_datas)` expression (notebook residue) was removed —
# it has no effect when run as a script.
# Single-column DataFrame of all cell texts; reshape per-row if the
# original table's column structure is needed downstream.
df = pd.DataFrame(all_datas)