import scrapy
from scrapy.crawler import CrawlerProcess
import json
# Accumulates one {'title': [...], 'url': [...]} dict per search-result item;
# filled by ListSpider.parse while the crawl runs, then dumped to lists.json.
lists = []


class ListSpider(scrapy.Spider):
    """Scrape item titles and URLs from the site's advanced-search result page.

    Each matched ``h3.itemTitle`` node yields a dict with the anchor text and
    href (both as lists, since ``.extract()`` returns all matches) appended to
    the module-level ``lists`` accumulator.
    """

    name = "list"
    start_urls = [
        'http://www.techstart.org.uk/index.php?main_page=advanced_search_result&search_in_description=1&keyword=a&inc_subcat=0&sort=20a&page=1',
    ]

    def parse(self, response):
        for index in response.css('h3.itemTitle'):
            data = {
                'title': index.css('h3.itemTitle a::text').extract(),
                'url': index.css('h3.itemTitle a::attr(href)').extract(),
            }
            lists.append(data)
        # Pagination follow-up was disabled by the original author; kept for reference:
        # page = len(response.css('div.navSplitPagesLinks a::attr(href)'))
        # next_page = response.css('div.navSplitPagesLinks a::attr(href)')[page - 1].extract()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)


process = CrawlerProcess()
process.crawl(ListSpider)
process.start()  # blocks until the crawl finishes

# Dump AFTER process.start(): the spider populates ``lists`` during the crawl,
# so writing earlier would serialize an empty list.
with open('lists.json', 'w') as f:
    json.dump(lists, f)
import scrapy
from scrapy.crawler import CrawlerProcess
import json
# Accumulates one {'title': [...]} dict per product page scraped by InfoSpider.
infos = []


class InfoSpider(scrapy.Spider):
    """Scrape the product title (``h1`` text) from a product detail page.

    The commented-out code shows the intended workflow: load the URLs
    produced by the list spider from lists.json and crawl each of them.
    A single hard-coded product URL is used instead.
    """

    name = "info"
    # Intended start_urls source (disabled by the original author):
    # with open('lists.json', 'r') as f:
    #     result = json.load(f)
    # start_urls = [str(k['url'])[2:-2] for k in result]
    start_urls = [
        'http://www.techstart.org.uk/24-kilates-x-11-by-boris-bidjan-saberi-x-reebok-insta-pump-fury-blackwhite-various-sizes-p-5768.html',
    ]

    def parse(self, response):
        for index in response.css('div.centerColumn'):
            data = {
                'title': index.css('h1::text').extract(),
            }
            infos.append(data)


# Runner left disabled by the original author:
# process = CrawlerProcess()
# process.crawl(InfoSpider)
# process.start()
from bs4 import BeautifulSoup
import requests
# Module-level accumulator (currently unused by get_list, which only prints).
urls = []

start_url = 'http://www.techstart.org.uk/index.php?main_page=advanced_search_result&search_in_description=1&keyword=a&inc_subcat=0&sort=20a&page=1'


def get_list(url):
    """Fetch one search-result page and print a title/url dict per item.

    requests + BeautifulSoup alternative to ListSpider: selects every
    ``h3.itemTitle`` heading and its child anchor, pairs them up, and
    prints ``{'title': ..., 'url': ...}`` for each pair.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    titles = soup.select('h3.itemTitle')
    # Renamed from ``urls`` so the local no longer shadows the module-level list.
    links = soup.select('h3.itemTitle > a')
    for title, link in zip(titles, links):
        data = {
            'title': title.get_text(),
            'url': link.get('href'),
        }
        print(data)