12. Foreign-trade site scraping code

import scrapy
from scrapy.crawler import CrawlerProcess
import json

lists = []

class ListSpider(scrapy.Spider):
    name = "list"
    start_urls = [
        'http://www.techstart.org.uk/index.php?main_page=advanced_search_result&search_in_description=1&keyword=a&inc_subcat=0&sort=20a&page=1',
    ]

    def parse(self, response):
        # Each search result is wrapped in an <h3 class="itemTitle"> element.
        for index in response.css('h3.itemTitle'):
            data = {
                'title': index.css('a::text').extract_first(),
                'url': index.css('a::attr(href)').extract_first(),
            }
            lists.append(data)

        # Pagination (disabled). Note the slip in the original draft: the
        # yield sat outside the `if`, so it would fire even when no next
        # page was found.
        # page = len(response.css('div.navSplitPagesLinks a::attr(href)'))
        # next_page = response.css('div.navSplitPagesLinks a::attr(href)')[page - 1].extract()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

        # Dump everything collected so far; rewritten after every parsed page.
        with open('lists.json', 'w') as f:
            json.dump(lists, f)


process = CrawlerProcess()
process.crawl(ListSpider)
process.start()
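
Picking the last pager link, as the commented-out block does, breaks on the final page (the last link then points backwards). A more robust variant follows an explicit "next" link instead. This is only a sketch: the a[title*="Next"] selector assumes the Zen Cart pager labels its next-page link that way, which must be verified against the live markup:

class PagedListSpider(scrapy.Spider):
    name = "list_paged"
    start_urls = ListSpider.start_urls  # same search URL as above

    def parse(self, response):
        for index in response.css('h3.itemTitle'):
            # Yield items instead of appending to a module-level list.
            yield {
                'title': index.css('a::text').extract_first(),
                'url': index.css('a::attr(href)').extract_first(),
            }
        # Follow the pager's "next" link; selector is an assumption.
        next_page = response.css('div.navSplitPagesLinks a[title*="Next"]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Because the items are yielded, Scrapy's feed export can write the JSON file itself, e.g. CrawlerProcess(settings={'FEEDS': {'lists.json': {'format': 'json'}}}) on Scrapy 2.1+, so the file is no longer rewritten on every page.
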
import scrapy
from scrapy.crawler import CrawlerProcess
import json

infos = []

class InfoSpider(scrapy.Spider):

    name = "info"

    # Build start_urls from the list crawl's output. The original slicing
    # hack str(k['url'])[2:-2] stripped the ["..."] wrapper left by
    # extract(); with extract_first() above, the value can be used directly:
    # with open('lists.json', 'r') as f:
    #     result = json.load(f)
    #     start_urls = [k['url'] for k in result]

    start_urls = [
        'http://www.techstart.org.uk/24-kilates-x-11-by-boris-bidjan-saberi-x-reebok-insta-pump-fury-blackwhite-various-sizes-p-5768.html',
    ]

    def parse(self, response):
        # The product page keeps its content inside <div class="centerColumn">.
        for index in response.css('div.centerColumn'):
            data = {
                'title': index.css('h1::text').extract_first(),
            }
            infos.append(data)


# Left commented out: Twisted's reactor cannot be restarted, so a second
# CrawlerProcess in the same script fails after the first one has run.
# process = CrawlerProcess()
# process.crawl(InfoSpider)
# process.start()
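
The lists.json handoff exists mainly to work around that reactor restriction. The usual Scrapy pattern is to chain both stages in a single spider with response.follow, so no intermediate file or second process is needed. A minimal sketch reusing the same selectors as above:

import scrapy
from scrapy.crawler import CrawlerProcess

class ShopSpider(scrapy.Spider):
    name = "shop"
    start_urls = [
        'http://www.techstart.org.uk/index.php?main_page=advanced_search_result&search_in_description=1&keyword=a&inc_subcat=0&sort=20a&page=1',
    ]

    def parse(self, response):
        # Stage 1: follow each listed item's link to its detail page.
        for link in response.css('h3.itemTitle a::attr(href)').extract():
            yield response.follow(link, callback=self.parse_item)

    def parse_item(self, response):
        # Stage 2: scrape the detail page itself.
        yield {
            'title': response.css('div.centerColumn h1::text').extract_first(),
            'url': response.url,
        }

process = CrawlerProcess()
process.crawl(ShopSpider)
process.start()
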
from bs4 import BeautifulSoup
import requests

start_url = 'http://www.techstart.org.uk/index.php?main_page=advanced_search_result&search_in_description=1&keyword=a&inc_subcat=0&sort=20a&page=1'

def get_list(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Title text and link live in the same <h3 class="itemTitle"> node.
    titles = soup.select('h3.itemTitle')
    links = soup.select('h3.itemTitle > a')

    for title, link in zip(titles, links):
        data = {
            'title': title.get_text(),
            'url': link.get('href'),
        }
        print(data)

get_list(start_url)
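
This requests version only fetches page 1, but since the page number is an explicit query parameter, pagination can be a plain loop. A minimal sketch continuing from the imports above; the stop condition (a page with no h3.itemTitle results) is an assumption, as is relying on 'page=1' occurring only once in this URL:

def get_all_pages(base_url, max_pages=50):
    page = 1
    while page <= max_pages:
        # Swap successive page numbers into the "...&page=1" query string.
        url = base_url.replace('page=1', 'page=%d' % page)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.select('h3.itemTitle')
        if not items:  # assumed stop condition: an empty result page
            break
        for h3 in items:
            a = h3.select_one('a')
            print({'title': a.get_text(), 'url': a.get('href')})
        page += 1

get_all_pages(start_url)
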