# Download a table from the query page and dump it to a CSV file.
# NOTE(review): `code`, `starttime`, `endtime`, `headers` and `pd` are not
# defined in the visible part of this file, and `requests` is only imported
# further down -- confirm they are in scope before this runs.
url = 'https://www.baidu.com/iframe/a.php?q={}&1={}&w={}'.format(code, starttime, endtime)
# Reuse the URL built above instead of formatting the same string twice.
response = requests.get(url, headers=headers)
# NOTE(review): `response` is not used below; read_html is handed the URL,
# not the response body -- confirm which of the two fetches is intended.
tb = pd.read_html(url)
# Write the first parsed table with its index column and without a header
# row. The flag must be the Python constant `False`; lowercase `false` is an
# undefined name and raises NameError.
tb[0].to_csv(r'数据.csv', index=True, header=False)
import hashlib, json, requests, time, re, pymysql
from lxml import etree
# MySQL connection settings; mysql() passes them to pymysql.connect().
host = '127.0.0.1'  # database server address
users = 'root'  # login user name
pwd = ''  # login password (empty string)
port = 3306  # server port
dbs = 'python'  # name of the database to use
def mysql(id, user_name, level, detail, create_time):
    """
    Insert one comment row into the ``mafengwo`` table.

    A new connection is opened on every call using the module-level
    connection settings, and it is always closed before returning.
    Failures while executing or committing are printed, not raised.

    :param id: user id
    :param user_name: user name
    :param level: user level
    :param detail: comment text
    :param create_time: publication time
    :return: None
    """
    row = (id, user_name, level, detail, create_time)
    sql = "INSERT INTO mafengwo(id,user_name,level,detail,create_time) values(%s,%s,%s,%s,%s)"
    # `password` / `database` replace the deprecated `passwd` / `db` aliases.
    db = pymysql.connect(host=host, user=users, password=pwd, port=port, database=dbs)
    try:
        # The context manager closes the cursor even when execute() fails.
        with db.cursor() as cursor:
            cursor.execute(sql, row)
        db.commit()
        print('success!')
    except Exception as e:
        # Best-effort insert: undo the partial transaction and report.
        db.rollback()
        print("error.", e)
    finally:
        # Runs even if rollback() itself raises, so the connection never leaks.
        db.close()
def get_params(page, poi_id):
    """
    Build the signed query parameters for the comment-list request.

    The ``_sn`` signature is characters 2 through 11 of the MD5 hex digest
    of the compact JSON encoding of the parameters followed by a fixed salt.

    :param page: page number
    :param poi_id: hotel id
    :return: dict of query parameters, including ``_ts`` and ``_sn``
    """
    salt = "c9d6618dbc657b41a66eb0af952906f1"
    query = {
        "_ts": str(int(time.time() * 1000)),  # current time in milliseconds
        "keyword_id": "0",
        "page": f"{page}",
        "poi_id": f"{poi_id}",
        "type": "0",
    }
    # Key order matters: the signature is computed over the serialized dict.
    compact_json = json.dumps(query, separators=(',', ':'))
    digest = hashlib.md5((compact_json + salt).encode()).hexdigest()
    query["_sn"] = digest[2:12]
    return query
def down(page, pid, timeout=10):
    """
    Fetch one page of hotel comments and return its HTML fragment.

    :param page: page number
    :param pid: hotel id
    :param timeout: seconds to wait for the server before giving up
    :return: the value of the ``"html"`` key in the JSON response
    """
    url = "https://www.mafengwo.cn/hotel/info/comment_list"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    }
    params = get_params(page, pid)
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    return response.json()["html"]
def func(res):
    """
    Parse the comment HTML and store every comment in the database.

    :param res: HTML text
    :return: None
    """
    if not res:
        # An empty payload contains no comments to parse.
        return
    tree = etree.HTML(res)
    for div in tree.xpath('//div[@class="comm-item _j_comment_item"]'):
        user_name = ''.join(div.xpath('div[@class="user"]/a[@class="name"]/text()'))
        level = ''.join(div.xpath('div[@class="user"]/a[@class="LV"]/text()'))
        # Evaluate the avatar link once. A comment without one is treated
        # like the 'javascript:;' placeholder instead of raising IndexError.
        avatar_hrefs = div.xpath('div[@class="user"]/a[@class="avatar"]/@href')
        avatar_href = avatar_hrefs[0] if avatar_hrefs else 'javascript:;'
        if avatar_href != 'javascript:;':
            # The user id is taken from the "u=...&" part of the link.
            user_id = ''.join(re.findall("u=(.*?)&", avatar_href))
        else:
            user_id = ""
        # Drop the U+1F31F character from the comment text before storing it.
        detail = ''.join(div.xpath('div[@class="txt"]/text()')).replace('\U0001f31f', '')
        create_time = ''.join(div.xpath("div[@class='comm-meta']/span[@class='time']/text()"))
        mysql(user_id, user_name, level, detail, create_time)
if __name__ == '__main__':
    # Fetch the first comment page for hotel 7091472, then parse and store it.
    first_page_html = down(page=1, pid=7091472)
    func(first_page_html)