1. Crawl results: the script prints one block per product (name, pid, description, image URLs, price); the original screenshot of the console output is omitted here.
2. Code walkthrough:
① First, define a crawler class:
import requests
from bs4 import BeautifulSoup
class spider:
② The constructor:
page: the search-result page to fetch; it is appended to the URL's query string.
    def __init__(self, page):
        self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page=' + str(page)
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
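The User-Agent above imitates a very old Internet Explorer, and some sites reject such strings. A minimal alternative, assuming any current browser UA string is acceptable (the exact string below is illustrative, not from the original):

    # hypothetical modern headers; substitute if the old UA gets blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }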
③ Fetching the page
Setting the response's encoding attribute forces UTF-8 decoding and prevents garbled characters (mojibake).
    # fetch the HTML source of one result page
    def get_html(self):
        res = requests.get(self.url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        # print(html)
        return html
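If the server does not always declare UTF-8, a common defensive variant lets requests guess the charset from the response body (a sketch, not part of the original code):

    res = requests.get(self.url, headers=self.headers)
    # apparent_encoding is requests' best guess based on the response content
    res.encoding = res.apparent_encoding or 'utf-8'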
④ The scraping function
a. BeautifulSoup() parses the raw HTML into a normalized document tree;
b. find_all(tag, class_=...) collects every tag of the given class;
c. select() picks descendant elements with a CSS selector (a small standalone demo follows this list).
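To make the three calls concrete, here is a self-contained demo on a toy <li> that mimics JD's product markup (the HTML below is invented for illustration):

    from bs4 import BeautifulSoup

    html = '''
    <li class="gl-item" data-pid="100">
      <div class="p-img"><img data-lazy-img="//img.example.com/a.jpg"></div>
      <div class="p-price"><i>59.00</i></div>
      <div class="p-name"><a target="_blank" title="demo pants"><em>demo pants</em></a></div>
    </li>
    '''
    soup = BeautifulSoup(html, 'lxml')
    item = soup.find('li', class_='gl-item')           # find/find_all: tag name + class
    print(item.get('data-pid'))                        # attribute access -> 100
    print(item.find('img').get('data-lazy-img'))       # nested tag, then attribute
    print(item.select('div.p-price i')[0].text)        # select: CSS selector -> 59.00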
    def get_information(self):
        html = self.get_html()
        # html.encode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all("div", class_='p-img')            # image blocks
        divs_prices = soup.find_all("div", class_='p-price')   # price blocks
        divs_name = soup.find_all("div", class_="p-name")      # product names
        divs_content = soup.find_all("em")                     # product descriptions
        divs_all = soup.find_all("li", class_="gl-item")       # one <li> per product
        for div in divs_all:
            name = div.find("a", target="_blank").get("title")
            content1 = div.select("em:nth-of-type(1)")[1].text
            # content = div.find("em").text
            img1 = div.find("img").get('source-data-lazy-img')
            img2 = div.select("img:nth-of-type(1)")[1].get("data-lazy-img")
            price = div.find("i").text
            pid = div.get("data-pid")
            print("Product name:", name)
            print("Product pid:", pid)
            print("Description:", content1)
            print("Image 1:", img1)
            print("Image 2:", img2)
            print("Price:", price)
            print("\n")
⑤ Finally, an entry method that simply delegates to the scraper:
    def main(self):
        self.get_information()
3. Full source:
import threading

import requests
from bs4 import BeautifulSoup


class spider:
    def __init__(self, page):
        self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page=' + str(page)
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    # fetch the HTML source of one result page
    def get_html(self):
        res = requests.get(self.url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        # print(html)
        return html

    def main(self):
        self.get_information()

    def get_information(self):
        html = self.get_html()
        # html.encode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all("div", class_='p-img')            # image blocks
        divs_prices = soup.find_all("div", class_='p-price')   # price blocks
        divs_name = soup.find_all("div", class_="p-name")      # product names
        divs_content = soup.find_all("em")                     # product descriptions
        divs_all = soup.find_all("li", class_="gl-item")       # one <li> per product
        for div in divs_all:
            name = div.find("a", target="_blank").get("title")
            content1 = div.select("em:nth-of-type(1)")[1].text
            # content = div.find("em").text
            img1 = div.find("img").get('source-data-lazy-img')
            img2 = div.select("img:nth-of-type(1)")[1].get("data-lazy-img")
            price = div.find("i").text
            pid = div.get("data-pid")
            print("Product name:", name)
            print("Product pid:", pid)
            print("Description:", content1)
            print("Image 1:", img1)
            print("Image 2:", img2)
            print("Price:", price)
            print("\n")


if __name__ == '__main__':
    threads = []
    for i in range(1, 2):
        # JD's visible result pages use odd page numbers, while the ajax request
        # for the second half of each page uses the next even number (page + 1)
        page = i * 2 - 1
        t = threading.Thread(target=spider(page).main)
        threads.append(t)
    for t in threads:
        t.start()
        t.join()
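Note that t.join() is called right after t.start() inside the same loop, so the threads actually run one after another. To fetch several pages concurrently, start all threads first and join them afterwards (a sketch of the standard pattern; the page range is an assumed example):

    threads = []
    for i in range(1, 5):              # e.g. the first four result pages (assumed)
        page = i * 2 - 1
        t = threading.Thread(target=spider(page).main)
        threads.append(t)
        t.start()                      # launch without blocking
    for t in threads:
        t.join()                       # wait for all pages to finish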