The secret of success lies in never changing your set purpose.
After seeing someone post this request and an expert answer it, I picked up the method, then optimized it and built on it.
Websites with tabular data (the URLs are in the code below), for example:
I scraped all four of the sites above; the approaches are basically the same, with minor differences.
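All four scrapes boil down to the same three steps: fetch the page, isolate the target table's HTML, and let pandas.read_html turn it into a DataFrame. As a minimal sketch of that shared pattern (fetch_table is a hypothetical helper of mine, not from the original answer; attrs is whatever identifies the table on a given site):

import requests
import pandas as pd
from bs4 import BeautifulSoup

def fetch_table(url, attrs):
    # e.g. attrs={"class": "fancyTable"}; returns the first matching table.
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    table_html = soup.find_all("table", attrs=attrs)[0].decode()
    return pd.read_html(table_html, header=0)[0]

The full script, covering all four sites: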
"""
@author: cht
@time: 2019/8/18 13:12
"""
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
from lxml import etree
def run():
    url = "https://mp.weixin.qq.com/s/li7BbNrZy-eOm79D6Eh-mA"
    req = requests.get(url)
    # One sub-list per field; each starts with its header and collects one column.
    mylist = [["Listing type"], ["Community name"], ["Location"], ["District"],
              ["Floor plan"], ["Area (㎡)"], ["Price (yuan)"], ["Floor"],
              ["Renovation"], ["Other info"], ["Contact"], ["Phone"]]
    soup = BeautifulSoup(req.content, "html.parser")
    # Each <tbody> holds one listing; row i carries field i, with the value
    # in the row's second child.
    for it in soup.find_all('tbody', attrs={"style": "box-sizing: border-box;"}):
        for i in range(len(it.contents)):
            data = it.contents[i].contents[1].text
            mylist[i].append(data)
    print(mylist)
    out = pd.DataFrame(mylist)
    out.to_excel(r'1.xls')
# This run() method is slow and doesn't use BeautifulSoup's parsing power to the
# full, but it impressed me enough to go and learn the optimized approaches
# below. Thanks to that poster.
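# A faster alternative, as a sketch of my own (not the answerer's code):
# pd.read_html parses every <table> in the article at once. Each listing comes
# back as rows of (field name, value), so zip the two columns into a dict per
# listing. This assumes each listing is a plain <table> element; adjust if the
# article nests them differently.
def run_v2():
    req = requests.get("https://mp.weixin.qq.com/s/li7BbNrZy-eOm79D6Eh-mA")
    records = [dict(zip(t[0], t[1])) for t in pd.read_html(req.text)]
    pd.DataFrame(records).to_excel(r'1.xls')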
def csres():
    url = "http://www.csres.com/notice/50655.html"
    res = requests.get(url)
    # Method 1: locate the table with an lxml XPath, serialize it back to
    # HTML, and let pandas parse it.
    res_elements = etree.HTML(res.text)
    table = res_elements.xpath('//table[@style="border-collapse: collapse"]')
    table = etree.tostring(table[0], encoding='utf-8').decode()
    # `table` is now an HTML string that pd.read_html can parse.
    df = pd.read_html(table, encoding='utf-8', header=0)[0]
    results = list(df.T.to_dict().values())  # one dict per row
    print(results)
    # df.to_csv("./std1.csv", index=False)
    # Method 2: locate the same table with BeautifulSoup instead.
    soup = BeautifulSoup(res.content, "html.parser")
    table = soup.find_all('table', attrs={"style": "border-collapse: collapse"})[0].decode()
    df = pd.read_html(table, encoding='utf-8', header=0)[0]
    results = list(df.T.to_dict().values())
    print(results)
    # df.to_csv("./std2.csv", index=False)
def cbrc():
    url = "http://www.cbrc.gov.cn/chinese/home/docViewPage/110009&current=1"
    # This site appears to reject the default requests User-Agent, so send a
    # browser one.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, "html.parser")
    table = soup.find_all('table', attrs={"id": "testUI"})[0].decode()
    df = pd.read_html(table, encoding='utf-8', header=0)[0]
    results = list(df.T.to_dict().values())
    print(results)
    # df.to_csv("./cdrc.csv", index=False)
def stockQuery():
    url = 'http://s.askci.com/stock/a/?reportTime=2019-03-31&pageNum=1#QueryCondition'
    # The URL can be generalized: changing the pageNum= value switches pages,
    # and the page shows the total page count, so a for loop can walk every
    # page (see the stock_query_all sketch below).
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    table = soup.find_all('table', attrs={"class": "fancyTable"})[0].decode()
    df = pd.read_html(table, encoding='utf-8', header=0)[0]
    results = list(df.T.to_dict().values())
    print(results)
    df.to_csv("./stock.csv", index=False)
if __name__ =="__main__":
# run()
# csres()
# cbrc()
stockQuery()
Result: