前言
本文使用 Python 抓取多页图书数据,获取每本书各个维度的信息,并对其中部分字符串字段做了清洗处理,最后用 pandas 模块将结果保存为 Excel 文件。
1.源代码
代码如下:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Session cookie copied from a logged-in browser session and sent verbatim with
# every request.
# NOTE(review): the value is cut off after "tx_figureurl=https:" in this copy of
# the script (the rest of the URL and any following cookie pairs are missing),
# which left the string literal unterminated. The literal is closed here at the
# point where the text survives so the module can be parsed; replace the whole
# value with a cookie from your own browser session before running.
# NOTE(review): this is a hard-coded credential (it includes a `secret_key`
# pair) -- avoid committing a real one to version control.
Cookie = '__permanent_id=20220513170904104321665604699593064; __ozlvd=1652433083; dest_area=country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0; MDD_permanent_id=20220514110040183284142509428454779; MDD_province_str=%E5%8C%97%E4%BA%AC; MDD_province_id=111; MDD_city_str=%E5%8C%97%E4%BA%AC%E5%B8%82; MDD_city_id=1; MDD_area_str=%E4%B8%9C%E5%9F%8E%E5%8C%BA; MDD_area_id=1110101; secret_key=efcb6c415140cae4f4c0a166ed773e7d; ddscreen=2; pos_6_start=1653042630185; pos_6_end=1653042630438; bind_cust_third_id=ocil5uKns4SFMnjuJsmygDnUQfWQ; tx_open_id=oqh4kuCwpq6-aPSHG-lYsnfn-3DM; tx_nickname=UmV0cmVhdGluRw==; tx_figureurl=https:'

# Request headers shared by every HTTP call in this script: a desktop Chrome
# User-Agent string plus the session cookie defined above.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'Cookie': Cookie,
}
def dangdang(page):
url = "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-year-2021-0-1-{}".format(page)
# http:
res = requests.get(url, headers=headers)
res.raise_for_status()
res.encoding = res.apparent_encoding