京东爬虫:将抓取到的商品数据写入 Elasticsearch 并导出为 CSV 文件

import requests
import parsel
import csv
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at https://localhost:9200, authenticated
# with basic auth. Certificate verification is switched off and every request
# is given request_timeout=3600.
# NOTE(review): the password is hard-coded in source; consider loading it from
# the environment or a secrets store.
es = Elasticsearch(
    ["https://localhost:9200"],
    basic_auth=('elastic', 'hw-FxQTckOuswZ1jRikA'),
    verify_certs=False,
    request_timeout=3600,
)
print(es)

# NOTE(review): "produict" looks like a typo of "product"; left unchanged
# because the name is runtime data that other consumers may already rely on.
INDEX_NAME = "jd-produict-list"

# Index definition: all four product fields are mapped as "text"; "link"
# additionally carries "index": "false".
mappings = {
    "mappings": {
        "properties": {
            "name": {"type": "text"},
            "price": {"type": "text"},
            "link": {"type": "text", "index": "false"},
            "shop": {"type": "text"},
        },
    },
}

# Start every run from an empty index: delete any existing one (a 404 from the
# delete call is ignored, so a missing index is not an error), then create it
# again with the mapping above.
res1 = es.options(ignore_status=404).indices.delete(index=INDEX_NAME)
res = es.indices.create(index=INDEX_NAME, body=mappings)
print(res)
# Full request URL for JD's search API (host api.m.jd.com,
# functionId=pc_search_s_new). The `body` query parameter is URL-encoded JSON;
# decoded, it asks for keyword "java", page "2", s "28".
# The URL also embeds a `t` timestamp plus `h5st`, `uuid` and
# `x-api-eid-token` values.
# NOTE(review): these look like values captured from one browser session and
# presumably need re-capturing when they stop being accepted -- confirm.
url = ("https://api.m.jd.com/?appid=search-pc-java&functionId=pc_search_s_new&client=pc&clientVersion=1.0.0&t"
       "=1713168142494&body=%7B%22keyword%22%3A%22java%22%2C%22pvid%22%3A%22555ce3e0d8264f2d91d2a39ce764254d%22%2C"
       "%22page%22%3A%222%22%2C%22s%22%3A%2228%22%2C%22scrolling%22%3A%22y%22%2C%22log_id%22%3A%221713167867206.8212"
       "%22%2C%22tpl%22%3A%221_M%22%2C%22isList%22%3A0%2C%22show_items%22%3A%22%22%7D&loginType=3&uuid=143920055"
       ".248095561.1709373914.1713090587.1713167839.7&area=22_2047_2057_41318&h5st=20240415160222499"
       "%3Bqyteemmthhkli668%3Bf06cc"
       "%3Btk03wba031cc818n7XIljX0jRqe6t8PhrN26jxhwORrkGFMCJkdMI_mhV5hE3Eush3x6BvzotE7yd5skQSLeYJsl8Zj9"
       "%3B9c9e8a2fcf3b817a39ed9a4537423857ecb12ff01dc08a86a29dc756954c78f5%3B4.3%3B1713168142499"
       "%3Bdd5cf478e70f73f34d265642738e316bc0f9a6da94374905c9970322b82790b67a374a2bbdafb5ad993bdd6053b7597cc886ae4bf8fcedcba6c5956c447465c0dcf408247a56be744d7e2067f7e49bd48f0f0d97e020a9b2ea47e8aaaceb203492ca2c9ea194c67d83671184546ff6dd15a27dc8bfe455f99ba707297b8a174bd3940c48dd5b3c0f7e1ea793da2b553ec060d55ddbbfe008ada86e0f9f49e84afc6a284629f4b771d9fc7b1e69168cae305ee52472620d36f52a6e32f690aa123cb190e570d18b41455176fc4307d0da7538ec59332f0b693e2e5e8d0d437b972ea34cbbd4da2d3e28d51638c109f62b5750616bdd1d0efe7a9efb79f355cc00eb64fc78ffa7b8f7d07b700fd6ff08c2e9485c68fcd3be97322443c50a80c175a042240464fcb44c1077abb2be053935&x-api-eid-token=jdd03DUKVBH2K5ZMO5GIJKRDXRXZHDKA3QO7X55M3HVSZIRJIANBNEILJHSGX726HOXQT5BJVGTOA3KL6BFN37PF5YWGZK4AAAAMO4DBLQKAAAAAADP3HHRAOTIAZJUX")
# HTTP headers sent with the search request: a jd.com search referer, a
# desktop Edge/Chrome User-Agent string, and a Cookie header.
# NOTE(review): the Cookie value contains account-specific entries (pin,
# thor, ...) and appears to be copied from a logged-in browser session;
# replace it with your own before running and avoid committing real session
# cookies.
headers = {
    "referer": "https://search.jd.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 "
                  "Safari/537.36 Edg/123.0.0.0",
    "Cookie": "__jdu=248095561; shshshfpa=e1e8c8c2-76d0-4515-f460-a4fb1af562b4-1713011253; "
              "shshshfpx=e1e8c8c2-76d0-4515-f460-a4fb1af562b4-1713011253; areaId=22; PCSYCityID=CN_510000_511800_0; "
              "TrackID"
              "=1_wuJAB_unk5LQz1rtzmKLfu4IlhRZrrNVkeSWfMLiU3HceCELY2pe155SiHjqI6hFAgmyWP9LaUNWo8HWH_DtCC2M4_2sPpCd3OtUR5aqeYaD8ALkWPAwjYX6pju75zv; thor=D08D1C29121373231FAA3079BB16EA76828F570B265DB94BE63BBAFE4E9A2EBAF6F29D511311DF57D17852B20F0857963E0886A5B9688A67FA16B14863AF28BB29C9C0D30A794D1DC592855F3FDA864D050F35B6283D5D91BF90A719AA88939B0AD712B88708E605113C249D442ED475780A129AC8D91F68373EC410C427327110921A19D5862C849353569BDC0E09B5; flash=2_niWzYUL8kZdBMSb_uOFDzZZV0FUKd8ZwpUP9gk-3Y4MjZTJuSO_tabUskOfoLN0S6sBru6WCszuDzU1QcAP8iTiFzHjJpAb744VJxrGqg5k*; pinId=JJO6sS-xQWyafxOhHQLy0Q; pin=maohengheng; unick=maohenghenghhc; ceshi3.com=203; _tp=s0REgHirLZTaxYQ0SEDyYw%3D%3D; _pst=maohengheng; ipLoc-djd=22-2047-2057-41318; unpl=JF8EAJdnNSttXhwDVxhRGxpDG11XW11bQ0QLOzBQAFRdSVVXGQIdQRJ7XlVdWBRKEh9vYxRUWVNPUg4bBSsSEXteU11bD00VB2xXXAQDGhUQR09SWEBJJVlQXl4ITxcFZ2A1ZF5Ye1QEKwEeGxFDWVBZXThKJwRfVzVUXVBDVQcrAysTIAkJCFteDUwTBCJkAF1cUE9QAhsyGiIT; __jdv=76161171|haosou-search|t_262767352_haosousearch|cpc|5512151796_0_7ffc2b19ba1241b9b9efde8430c317b3|1713167838925; 3AB9D23F7A4B3CSS=jdd03DUKVBH2K5ZMO5GIJKRDXRXZHDKA3QO7X55M3HVSZIRJIANBNEILJHSGX726HOXQT5BJVGTOA3KL6BFN37PF5YWGZK4AAAAMO4DBLQKAAAAAADP3HHRAOTIAZJUX; jsavif=1; __jda=143920055.248095561.1709373914.1713090587.1713167839.7; __jdc=143920055; shshshfpb=BApXeTmLK4-tAHmbShJeroUapzYyhM9NuBlFAFXxu9xJ1MuVBB4C2; __jdb=143920055.3.248095561|7.1713167839; 3AB9D23F7A4B3C9B=DUKVBH2K5ZMO5GIJKRDXRXZHDKA3QO7X55M3HVSZIRJIANBNEILJHSGX726HOXQT5BJVGTOA3KL6BFN37PF5YWGZK4"
}
# Fetch one JD search-result page, extract every product card, and persist
# each product twice: as a row in 京东.csv and as a document in the
# Elasticsearch index INDEX_NAME.
#
# The CSV file is opened once for the whole run rather than reopened in
# append mode for every row. As before, the header is written before the
# request is made, so a failed request still leaves a header-only file.
with open('京东.csv', encoding='utf-8', newline='', mode='w') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['name', 'price', 'shop', 'link'])

    # A timeout keeps the script from hanging forever on a stalled connection.
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept because the original request used it -- confirm it is still needed.
    response = requests.get(url=url, headers=headers, verify=False, timeout=30)
    html_data = response.text
    select = parsel.Selector(html_data)

    # Each product is an <li class="gl-item"> whose details live in a nested
    # <div class="gl-i-wrap">.
    goodlist = select.xpath('//li[@class="gl-item"]')
    for li in goodlist:
        # string(...) flattens the node to its text content; .get("") yields
        # an empty string when nothing matched, so .strip() is always safe.
        price = li.xpath('string(.//div[@class="gl-i-wrap"]/div[@class="p-price"])').get("").strip()
        name = li.xpath('string(.//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em)').get("").strip()
        link = li.xpath('string(.//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/@href)').get("").strip()
        shop = li.xpath('string(.//div[@class="gl-i-wrap"]/div[@class="p-shop"])').get("").strip()

        csv_writer.writerow([name, price, shop, link])

        # Index the same record into Elasticsearch; the document id is left
        # for Elasticsearch to generate.
        e1 = {
            'name': name,
            'price': price,
            "link": link,
            'shop': shop,
        }
        res1 = es.index(index=INDEX_NAME, document=e1)
        print(res1)
        print(shop)

注意:运行前请将 headers 中的 Cookie 替换为你自己的 Cookie。

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值