一、顶点小说全站抓取(代码运行下去理论上能抓就行)
先获取所有分类链接
import requests
from lxml import etree
import pymysql
# Browser-like HTTP headers prepared for the novel site.
# NOTE(review): none of the requests.get() calls in the functions below pass
# these headers, and 'Host' names www.23us.co while the URLs requested below
# use 23us.co -- confirm whether the headers are meant to be sent.
headers = {
'Host': 'www.23us.co',
'Referer': 'https://www.baidu.com/link?url=Kf8hqBe68nV-DipI3bExcurGlRpGxvq2j0kBbVOrk37&wd=&eqid=91dc6e33001b84210000000263787966',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
# Module-level MySQL connection and cursor; save_chapter_list() and save_book()
# execute and commit through these two objects.
# NOTE(review): the credentials are hard-coded and no charset is given --
# presumably the server/driver default handles Chinese text; verify.
db = pymysql.connect(user='root',password='123456',db='顶点小说')
cursor = db.cursor()
def get_max_page(urls, timeout=10):
    """Return the number of listing pages advertised by a category page.

    The page at ``urls`` is downloaded, decoded as GBK and searched for the
    text of the ``<a class="last">`` pager link.

    Args:
        urls: URL of a category listing page.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block the crawl forever.

    Returns:
        The page count as an ``int``. Falls back to ``1`` when the pager link
        is absent or its text is not a number, so the caller still visits
        the page it already knows about.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(urls, timeout=timeout)
    # Ignore stray bytes that are not valid GBK instead of aborting the page.
    source = response.content.decode('gbk', errors='ignore')
    last_page = etree.HTML(source).xpath('//a[@class="last"]/text()')
    if not last_page:
        return 1
    try:
        return int(last_page[0])
    except ValueError:
        return 1
def save_chapter_list(**kwargs):
    """Insert one row into the ``book`` table and commit it.

    Keyword Args:
        book_name: Title of the book (default ``'未取到'``).
        book_id: Site identifier of the book (default ``'未取到'``).
        status: Status flag stored with the row (default ``0``).
        chapter_list_url: URL of the book's chapter index
            (default ``'未取到'``).

    The values are handed to the driver as query parameters rather than being
    formatted into the SQL text, so titles containing quotes or backslashes
    are escaped correctly and cannot alter the statement.
    """
    row = (
        kwargs.get('book_name', '未取到'),
        kwargs.get('book_id', '未取到'),
        kwargs.get('status', 0),
        kwargs.get('chapter_list_url', '未取到'),
    )
    sql = ('insert into book(book_name,book_id,status,chapter_list_url) '
           'values (%s,%s,%s,%s)')
    try:
        cursor.execute(sql, row)
        db.commit()
    except Exception:
        # Do not leave a half-finished transaction on the shared connection.
        db.rollback()
        raise
def save_book(**kwargs):
    """Insert one row into the ``chapters`` table and commit it.

    Keyword Args:
        chapter_name: Chapter title (default ``'未取到'``).
        bid: Identifier of the book the chapter belongs to
            (default ``'未取到'``).
        status: Status flag stored with the row (default ``0``).
        chapter_contents: Chapter body text (default ``'未取到'``).
        chapter_url: URL the chapter was fetched from (default ``'未取到'``).

    Chapter text is scraped prose and routinely contains quote characters, so
    the values are passed as query parameters instead of being formatted into
    the SQL string.
    """
    row = (
        kwargs.get('chapter_name', '未取到'),
        kwargs.get('bid', '未取到'),
        kwargs.get('status', 0),
        kwargs.get('chapter_contents', '未取到'),
        kwargs.get('chapter_url', '未取到'),
    )
    sql = ('insert into chapters'
           '(chapter_name,bid,status,chapter_contents,chapter_url) '
           'values (%s,%s,%s,%s,%s)')
    try:
        cursor.execute(sql, row)
        db.commit()
    except Exception:
        # Do not leave a half-finished transaction on the shared connection.
        db.rollback()
        raise
def get_book_chapter_list(urls, timeout=10):
    """Yield the chapter-index URL of every book linked from a listing page.

    Each ``<a>`` in the first cell of a table row is treated as a book link.
    The last path segment of its ``href`` is used as the book id, the book is
    recorded through ``save_chapter_list`` and the chapter-index URL built
    from the id is yielded.

    Args:
        urls: URL of one category listing page.
        timeout: Seconds to wait for the HTTP response.

    Yields:
        ``'http://23us.co/html/<first two chars of id>/<id>/'`` per book.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    response = requests.get(urls, timeout=timeout)
    source = response.content.decode('gbk', errors='ignore')
    for anchor in etree.HTML(source).xpath('//tr/td[1]/a'):
        href = anchor.xpath('@href')
        name = anchor.xpath('text()')
        # Skip anchors without a target or visible text rather than crashing.
        if not href or not name:
            continue
        # Tolerate a trailing slash, which would otherwise give an empty id.
        book_id = href[0].rstrip('/').split('/')[-1]
        if not book_id:
            continue
        chapter_list_url = 'http://23us.co/html/{}/{}/'.format(book_id[:2], book_id)
        save_chapter_list(book_name=name[0], book_id=book_id, status=0,
                          chapter_list_url=chapter_list_url)
        yield chapter_list_url
def get_contents(urls, timeout=10):
    """Download one chapter page and store it through ``save_book``.

    The book id is taken from the first chapter-style URL
    (``http://23us.co/html/<n>/<book_id>/<n>.html``) found in the page
    source, falling back to ``urls`` itself when the page contains none.
    The title is the first ``<h1>`` text and the body is the concatenated
    text of ``<dd id="contents">``.

    Args:
        urls: URL of the chapter page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        Whatever ``save_book`` returns.

    Raises:
        IndexError: If no book id can be found in the page or in ``urls``.
        requests.RequestException: If the download fails or times out.
    """
    # Imported locally: the module-level ``import re`` appears only later in
    # the file, after this function has already been called.
    import re
    response = requests.get(urls, timeout=timeout)
    source = response.content.decode('gbk', errors='ignore')
    # Raw string so that \. and \d reach the regex engine unchanged.
    pattern = re.compile(r'http://23us\.co/html/\d+/(\d+)/\d+\.html')
    found = pattern.search(source) or pattern.search(urls)
    if found is None:
        raise IndexError('no book id found for chapter page {}'.format(urls))
    book_id = found.group(1)
    page = etree.HTML(source)  # parse once, query twice
    titles = page.xpath('//h1/text()')
    # Same placeholder save_book uses for a missing chapter name.
    title = titles[0] if titles else '未取到'
    contents = ''.join(page.xpath('//dd[@id="contents"]/text()'))
    return save_book(bid=book_id, chapter_name=title,
                     chapter_contents=contents, chapter_url=urls)
def get_contents_title(urls, timeout=10):
    """Fetch a book's chapter index and download every chapter it links to.

    Every ``href`` found under ``//tr/td/a`` is resolved against ``urls`` and
    passed to ``get_contents``.

    Args:
        urls: URL of the chapter index page.
        timeout: Seconds to wait for the HTTP response of the index page.

    Raises:
        requests.RequestException: If the index download fails or times out.
    """
    from urllib.parse import urljoin

    response = requests.get(urls, timeout=timeout)
    source = response.content.decode('gbk', errors='ignore')
    for href in etree.HTML(source).xpath('//tr/td/a/@href'):
        # urljoin gives the same result as plain concatenation for the
        # relative "123.html" links when ``urls`` ends in "/", and also
        # copes with absolute or root-relative hrefs.
        get_contents(urljoin(urls, href))
# Crawl driver: walk category ids 1..10, every listing page of each category,
# every book on each page, and every chapter of each book.
# The loop variable is named ``category`` so the builtin ``type`` is not
# shadowed for the rest of the module.
for category in range(1, 11):
    first_page_url = 'http://23us.co/class/{}_1.html'.format(category)
    max_page = get_max_page(first_page_url)
    for page in range(1, max_page + 1):
        every_page_url = 'http://23us.co/class/{}_{}.html'.format(category, page)
        # get_book_chapter_list is a generator: books are saved and yielded
        # one at a time as this loop consumes it.
        chapter_list_url = get_book_chapter_list(every_page_url)
        for urls in chapter_list_url:
            get_contents_title(urls)
二、京东商城三页数据抓取
import requests
from lxml import etree
import re
# Request headers sent with every search.jd.com request in the loop below
# (rebinds the module-level name ``headers``).
headers = {
'referer': 'https://search.jd.com/',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56'
}
# Scrape three result pages of a JD search. Each visible page is fetched as
# two requests: an odd-numbered ``page`` and then ``page + 1`` carrying the
# ``wids`` value found in the first response as ``show_items``
# (presumably the lazily loaded second half of the page -- verify).
p = 1
# Compiled once; the pattern does not change between iterations.
demo = re.compile(r"wids:'(.*?)'")
for page in range(1, 7, 2):
    # The query string needs "&" between the page number and "click=0".
    source1 = requests.get(
        'https://search.jd.com/s_new.php?keyword=%E8%8B%B9%E6%9E%9C&qrst=1&suggest=1.his.0.0&wq=%E8%8B%B9%E6%9E%9C&stock=1&pvid=652f33ee3c6940a8a8a1fbb376df10f3&page={}&click=0'.format(page),
        headers=headers,
        timeout=10,
    ).text
    wids = demo.findall(source1)
    # Fall back to an empty id list instead of raising IndexError.
    num = wids[0] if wids else ''
    print(num)
    source2 = requests.get(
        'https://search.jd.com/s_new.php?keyword=%E8%8B%B9%E6%9E%9C&qrst=1&suggest=1.his.0.0&wq=%E8%8B%B9%E6%9E%9C&stock=1&pvid=652f33ee3c6940a8a8a1fbb376df10f3&page={}&s=26&scrolling=y&log_id=1669279334193.4075&tpl=1_M&isList=0&show_items={}'.format(page + 1, num),
        headers=headers,
        timeout=10,
    ).text
    divs1 = etree.HTML(source1).xpath('//div[@class="gl-i-wrap"]')
    divs2 = etree.HTML(source2).xpath('//div[@class="gl-i-wrap"]')
    divs = divs1 + divs2
    print(len(divs))
    for div in divs:
        # First price text node, or '' when the item carries no price.
        price = ''.join(div.xpath("div[2]/strong/i/text()")[:1])
        content = ''.join(div.xpath("div[3]/a/em//text()"))
        print(price, content)
    print('当前第{}页'.format(p))
    p += 1
三、震坤行三页数据抓取
# Request headers for www.zkh.com search requests.
# The cookie value is written as adjacent string literals, one cookie pair
# per line, which Python joins into a single string. A quoted literal must
# not contain a raw line break, so the value may not be split mid-string.
# NOTE(review): the cookie presumably comes from a captured browser session
# (it carries a JSESSIONID) and may expire -- confirm before relying on it.
headers = {
    'origin': 'https://www.zkh.com',
    'referer': 'https://www.zkh.com/search.html?keywords=%E5%8F%A3%E7%BD%A9&hasLinkWord=1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56',
    'cookie': (
        'AGL_USER_ID=f5353657-0d49-42b9-a71b-4befb2d69bca; '
        '_bl_uid=h8lqhag8p5R0aswX0nX4tqgxjjqv; '
        'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218493e308593d1-0fdf9f5bef6ccb-7d5d5475-921600-18493e3085a716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_utm_medium%22%3A%22sem1%22%2C%22%24latest_utm_campaign%22%3A%22C-%E5%93%81%E7%89%8C%22%2C%22%24latest_utm_content%22%3A%22%E6%A0%B8%E5%BF%83%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg0OTNlMzA4NTkzZDEtMGZkZjlmNWJlZjZjY2ItN2Q1ZDU0NzUtOTIxNjAwLTE4NDkzZTMwODVhNzE2In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2218493e308593d1-0fdf9f5bef6ccb-7d5d5475-921600-18493e3085a716%22%7D; '
        'anonymous_id=18493e308593d1-0fdf9f5bef6ccb-7d5d5475-921600-18493e3085a716; '
        'webSource=https%3A%2F%2Fwww.zkh.com%2F%3Futm_source%3Dbaidu%26utm_medium%3Dsem1%26utm_ter%3D%25E9%259C%2587%25E5%259D%25A4%25E8%25A1%258Cmro%26utm_content%3D%25E6%25A0%25B8%25E5%25BF%2583%26utm_campaign%3DC-%25E5%2593%2581%25E7%2589%258C%26sdclkid%3DAL2D152ibrDiAOqpALe_%26bd_vid%3D11204147592075616689; '
        'sensorsdata2015session=%7B%7D; '
        'citycode=%7B%22provinceName%22%3A%22%E5%9B%9B%E5%B7%9D%E7%9C%81%22%2C%22cityName%22%3A%22%E6%88%90%E9%83%BD%E5%B8%82%22%2C%22provinceCode%22%3A510000%2C%22cityCode%22%3A510100%7D; '
        'Hm_lvt_c9156633fc15595028b4d81a3571a23f=1668928445,1669356182; '
        'utmStore=%7B%22flow_type%22%3A%22%E5%85%8D%E8%B4%B9%22%2C%22%24utm_source%22%3A%22baidu%22%2C%22%24utm_medium%22%3A%22sem1%22%2C%22%24utm_content%22%3A%22%E6%A0%B8%E5%BF%83%22%2C%22%24utm_campaign%22%3A%22C-%E5%93%81%E7%89%8C%22%7D; '
        'p_pub_key=MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC36XGQaO8cG2ifwZNixxe7HVyqlzELwo2DC+LwgvE0Q8rjwLXxSucPAJrYnA3C3c8/moKiVHEs9U4rciZv4jW2FyG6ivXRnHouHpSVjl83LfYbL2QwXyDurSfGSelPDgC5QCs11TgF26N3FEa4f/kvypcEfNIgkK0MHBBK7Gp4cwIDAQAB; '
        'p_pub_gr=1669356018056; '
        'zaf_ukey=d368446d79fd4ebdb66ed5c753f50530; '
        'Hm_lpvt_c9156633fc15595028b4d81a3571a23f=1669357141; '
        'JSESSIONID=E307A9B5A975766DC4EA2527E66B7FD3'
    ),
}