from bs4 import BeautifulSoup
import requests
import time
import json
class Spider:
    """Scrape second-hand laptop listings from 58.com and dump them to data.json.

    Pipeline (driven by :meth:`go`):
      1. fetch each listing page and collect item-link anchors,
      2. extract the hrefs,
      3. fetch each item page and pull category/title/date/price/place,
      4. write the collected records as JSON.
    """

    def __init__(self):
        # User-Agent + Cookie copied from a real browser session to get past
        # 58.com's anti-bot check.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
            'Cookie': 'userid360_xml=DA351ED943B7D514950F847645DEB635; time_create=1587472887135; f=n; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=mm%7C%E8%8C%82%E5%90%8D%7C0; userid360_xml=DA351ED943B7D514950F847645DEB635; time_create=1587473108637; f=n; id58=c5/nn13Q11MweK6PMRylAg==; 58home=bj; city=bj; 58tj_uuid=7a6baace-998a-4b1b-8d83-4782fb6c0b66; new_uv=3; als=0; wmda_uuid=9dd05d0a6a6044021be1a1c6bab10d71; wmda_new_uuid=1; wmda_visited_projects=%3B11187958619315%3B1409632296065; xxzl_deviceid=5NSN2EJuLzsmCK8%2B7ZToyx1eJnR68wAN23F3N%2B%2FL9iJZc%2BYsmps2L%2BNMPAsT1gtT; gr_user_id=81fd7291-1408-4a46-8da0-5b6c7eed6f6b; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1584880886; Hm_lvt_e15962162366a86a6229038443847be7=1584880886; GA_GTID=0d3628eb-002a-7b7c-5add-41dea874a818; _ga=GA1.2.1565386338.1584881078; _gid=GA1.2.1326103577.1584881078; Hm_lvt_e2d6b2d0ec536275bb1e37b421085803=1584881379; final_history=41582026010401; ppStore_fingerprint=F794BAF2D2A55929EC6A17AC7BEB2196216B781C04FD13BE%EF%BC%BF1584881381109; sessionid=5ab38641-633f-4619-85c4-65ae47bef1e6; Hm_lpvt_3bb04d7a4ca3846dcc66a99c3e861511=1584945621; Hm_lpvt_e15962162366a86a6229038443847be7=1584945621; f=n; xzfzqtoken=JprXuyYTloLFy4BngTOYF%2F%2B2mutsItJBcFI5zzLh7sI19IE7oXs5JkMDG6tsfN%2Bxin35brBb%2F%2FeSODvMgkQULA%3D%3D; zscom=%5B0%2C0%2C0%2C0%5D; Hm_lpvt_e2d6b2d0ec536275bb1e37b421085803=1584881379; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; new_session=1; utm_source=; spm=; init_refer=https%253A%252F%252Fcallback.58.com%252Fantibot%252Fverifycode%253FserialId%253Db607ceb049f6a879564cdce058aaeceb_c0901def04c343378eceba20f8fac405%2526code%253D21%2526sign%253Dd97c2ace4d7cc3274bbf39d667b15f06%2526namespace%253Dhuangyelistpc%2526url%253Dhttps%25253A%25252F%25252Finfo5.58.com%25252Fbj%25252Fpbdn%25252F0%25252F; commontopbar_ipcity=mm%7C%E8%8C%82%E5%90%8D%7C0; wmda_session_id_11187958619315=1584948561215-5519ce6b-e294-476a; xxzl_cid=2e5a615f7295460593a6e98a49ea5745; xzuid=9d64ff8d-b9c4-49dd-91dd-4836ddadb433',
        }
        # Listing pages 1..11 of the pbdn (laptop) category.
        self.urls = ['https://bj.58.com/pbdn/0/pn{}'.format(i) for i in range(1, 12)]
        # One Session so cookies set by the server persist across requests.
        self.s = requests.Session()
        self.url_list = []            # per-page lists of <a> tags found by the selector
        self.href_list = []           # flat list of item-detail URLs
        self.things_information = []  # extracted item records (dicts)

    def __get_urls(self):
        """Fetch each listing page and collect the item-link anchor tags."""
        for url in self.urls:
            html = self.s.get(url, headers=self.headers)
            soup = BeautifulSoup(html.text, 'lxml')
            # colspan is an attribute; its value must be quoted in the selector.
            a_page_urls = soup.select('tr.ac_item > td[colspan="3"]:nth-child(1) > a:nth-child(1)')
            self.url_list.append(a_page_urls)
            # Throttle: requesting too fast trips the anti-bot and gets the IP banned.
            time.sleep(20)
            print('success')

    def __get_src(self):
        """Extract the href attribute from every collected anchor tag."""
        for a_page_urls in self.url_list:
            for anchor in a_page_urls:
                self.href_list.append(anchor.get('href'))

    def __get_things_information(self):
        """Fetch each item page and extract category, title, date, price and place."""
        for href in self.href_list:
            # Pass the same headers as the listing requests so the
            # User-Agent/Cookie anti-bot workaround also covers detail pages.
            html = self.s.get(href, headers=self.headers)
            soup = BeautifulSoup(html.text, 'lxml')
            # Each select() returns a list; we only use the first match below.
            sort = soup.select('.crb_a_2')
            title = soup.select('.detail-title__name')
            date = soup.select('div.detail-title__info__text:nth-child(1)')
            money = soup.select('.infocard__container__item__main__text--price')
            place = soup.select('div.infocard__container__item:nth-child(2) > div:nth-child(2) > a:nth-child(1)')
            try:
                record = {
                    'sort': sort[0].get_text(),
                    'title': title[0].get_text(),
                    'date': date[0].get_text(),
                    'money': money[0].get_text(),
                    'place': place[0].get_text(),
                }
            except IndexError:
                # A selector matched nothing (page layout differs or the item
                # was removed) — skip this item, best effort.
                pass
            else:
                self.things_information.append(record)

    def __write_information(self):
        """Write the collected records to data.json (UTF-8, human-readable)."""
        with open('data.json', 'w', encoding='utf-8') as file:
            json.dump(self.things_information, file, indent=2, ensure_ascii=False)

    def go(self):
        """Run the full scrape pipeline: collect links, fetch items, write JSON."""
        print('start')
        self.__get_urls()
        self.__get_src()
        self.__get_things_information()
        self.__write_information()
        print('end')
# Guard the entry point so importing this module does not immediately
# start a multi-minute network scrape.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
# 注意:'sort':sort[0].get_text() — sort 的类型是列表 (select() returns a list)。
# td[colspan="3"] — colspan 是属性名,具体值要用双引号。
# 请求太快会遭遇反爬,封IP,暂时还不会代理IP池,只能设置时间间隔了。
# 阅读量是JS控制,还不会爬取。