python爬取网页代码_爬取某网站写的python代码

代码如下:

import requests

from pyquery import PyQuery

import re

import os

import csv

import datetime

"""

说明:该代码是专门为爬取http://www.kgtmall.com.cn/商品而设计的。

使用方法:

1、在本地提前安装好python3的环境;

2、直接运行本代码;

3、运行本代码完后,会在当前目录生成一个result.csv文件,该文件里面就存了爬取该站点的商品信息

注意事项:在本代码运行期间,不能打开result.csv文件,因为这样程序就写不进去数据了;只能等本代码

全部运行结束后,才能打开result.csv文件进行查看。

"""

def get_html_text(url, timeout=10):
    """Download a page and return its body as text.

    :param url: Absolute URL of the page to fetch.
    :param timeout: Seconds to wait for the server, forwarded to
        ``requests.get``. Without a timeout a single stalled connection
        would block the crawl indefinitely.
    :return: The response body as decoded by requests (``Response.text``).
    :raises requests.exceptions.RequestException: on connection failure
        or when the timeout expires.
    """
    r = requests.get(url, timeout=timeout)
    return r.text

def get_one_level_class(home_url):
    """Yield the first-level categories listed in the home page menu.

    The home page is downloaded with ``get_html_text`` and every anchor
    matched by ``.menu_title a`` produces one ``(url, title)`` pair, e.g.::

        ('http://www.kgtmall.com.cn/mall/list.php?catid=4', '母婴用品')
        ('http://www.kgtmall.com.cn/mall/list.php?catid=5', '生活家居')

    :param home_url: URL of the site's home page.
    :return: Generator of ``(one_level_url, one_level_title)`` tuples.
    """
    home_page = PyQuery(get_html_text(home_url))
    for anchor in home_page('.menu_title a'):
        # Wrap the raw element again so the href/text helpers are available.
        link = PyQuery(anchor)('a')
        yield link.attr('href'), link.text()

def get_two_level_class(home_url):
    """Yield the second-level categories under every first-level category.

    For each first-level category the category URL is handed directly to
    ``PyQuery`` (which loads the document from that URL), and every
    ``.selector_category li`` entry yields one tuple, e.g.::

        ('母婴用品', '营养辅食', 'http://www.kgtmall.com.cn/mall/search.php?catid=539')

    :param home_url: URL of the site's home page.
    :return: Generator of
        ``(one_level_title, two_level_title, two_level_url)`` tuples.
    """
    for parent_url, parent_title in get_one_level_class(home_url):
        category_page = PyQuery(parent_url)
        for entry in category_page('.selector_category li'):
            link = PyQuery(entry)('a')
            sub_url = link.attr('href')
            sub_title = link.text()
            yield parent_title, sub_title, sub_url

def get_pages(url):
    """Return how many result pages a listing has.

    The page at *url* is loaded through ``PyQuery`` and the text of
    ``.pagination cite`` is searched for the page count that sits between
    "/" and "页". Any failure while extracting or converting the number
    is printed and the function falls back to a single page.

    :param url: URL of a category listing page.
    :return: Page count as ``int``; ``1`` when it cannot be determined.
    """
    pager_text = PyQuery(url)('.pagination cite').text()
    print('原pages:', pager_text)
    try:
        matches = re.findall('共.*?条/(.*)页', pager_text)
        page_count = int(matches[0])
    except Exception as err:
        # No pager on the page (or unexpected text): treat as one page.
        print(err)
        page_count = 1
    print('页码:', page_count)
    return page_count

def get_three_level_class(home_url):
    """Yield one listing-page URL per page of every third-level category.

    For each second-level category the ``.selector_category li`` entries
    are read; the ``catid`` is extracted from each entry's link, the
    number of pages is obtained with ``get_pages`` and one URL is built
    for every page, e.g.::

        ('母婴用品', '营养辅食', 'DHA',
         'http://www.kgtmall.com.cn/mall/search.php?kw=&list=0&catid=548&order=10&minprice=&maxprice=&page=1')

    Entries without a link, or whose link is not a ``search.php?catid=``
    URL, are skipped instead of aborting the whole crawl.

    :param home_url: URL of the site's home page.
    :return: Generator of ``(one_level_title, two_level_title,
        three_level_title, three_level_url_by_xiaoliang)`` tuples.
    """
    # Raw string with escaped dots: the previous non-raw '\?' was an
    # invalid escape sequence and the bare '.' matched any character.
    catid_pattern = re.compile(
        r'http://www\.kgtmall\.com\.cn/mall/search\.php\?catid=(.*)')
    # NOTE(review): order=10 presumably sorts by sales volume
    # ("xiaoliang") - confirm against the site.
    page_url_template = (
        'http://www.kgtmall.com.cn/mall/search.php'
        '?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}')

    for one_level_title, two_level_title, two_level_url in get_two_level_class(home_url):
        jpy = PyQuery(two_level_url)
        for line in jpy('.selector_category li'):
            link = PyQuery(line)('a')
            three_level_title = link.text()
            three_level_url = link.attr('href')

            # attr() returns None when the <li> has no anchor.
            match = catid_pattern.search(three_level_url or '')
            if match is None:
                continue
            catid = match.group(1)

            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                three_level_url_by_xiaoliang = page_url_template.format(catid, index)
                yield one_level_title, two_level_title, three_level_title, three_level_url_by_xiaoliang

def shop_title_and_url(home_url):
    """Yield the title and detail-page URL of every product in every listing.

    Each listing page produced by ``get_three_level_class`` is loaded via
    ``PyQuery``; for every ``.list_img a`` anchor the ``href`` is taken as
    the product URL and the ``alt`` of the nested image as its title.

    :param home_url: URL of the site's home page.
    :return: Generator of ``(one_level_title, two_level_title,
        three_level_title, shop_title, shop_url)`` tuples.
    """
    for top_title, mid_title, leaf_title, listing_url in get_three_level_class(home_url):
        listing_page = PyQuery(listing_url)
        for anchor in listing_page('.list_img a'):
            link = PyQuery(anchor)
            product_url = link('a').attr('href')
            product_title = link('a img').attr('alt')
            yield top_title, mid_title, leaf_title, product_title, product_url

def _first_match(pattern, text, default):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def get_shop_info(home_url, count):
    """Scrape every product's details and append them to ``result.csv``.

    For each product from ``shop_title_and_url`` the detail page is loaded
    via ``PyQuery`` and the price (``.price``), bar code
    (``.bar_code dl dd p``), specification and origin (parsed from the
    ``#content`` text) are collected. One CSV row per product is appended
    to ``result.csv`` located next to this script.

    :param home_url: URL of the site's home page.
    :param count: Number printed in front of the first product; it is
        incremented locally for each product and not returned.
    :return: ``None``.
    """
    # abspath() first: dirname(__file__) can be '' when the script is
    # started by a bare relative name, and '' + '/result.csv' would then
    # point at the filesystem root.
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')

    for one_level_title, two_level_title, three_level_title, shop_title, shop_url in shop_title_and_url(home_url):
        print('--排错:' + one_level_title, two_level_title, three_level_title, shop_title, shop_url)
        jpy = PyQuery(shop_url)
        price = jpy('.price').text()
        # Bar code
        bar_code = jpy('.bar_code dl dd p').text()

        # Extract the detail text once; both fields are parsed from it.
        detail_text = jpy('#content').text()
        guige = _first_match('规格:(.*)', detail_text, '没有规格')
        chandi = _first_match('产地:(.*)', detail_text, '没有产地')

        print(count, one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url)
        row = [one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url]

        # Append row by row so data collected so far survives an
        # interrupted crawl. newline='' avoids blank lines between rows;
        # utf-8 avoids charset errors when writing.
        with open(csv_file, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)
        count += 1

def main():
    """Run the full crawl and report how long it took.

    Creates (or truncates) ``result.csv`` next to this script, writes the
    header row, lets ``get_shop_info`` append one row per product, then
    prints the elapsed time in whole seconds.
    """
    # Record the start time.
    start_time = datetime.datetime.now()
    home_url = 'http://www.kgtmall.com.cn/'

    # Directory of this script. abspath() guards against dirname()
    # returning '' (which would turn the target into '/result.csv').
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')
    headers = ['一级分类', '二级分类', '三级分类', '商品名称', '条码', '产地', '规格', '价格', '商品链接']

    # newline='' avoids blank lines between rows; utf-8 avoids charset
    # errors when writing.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(headers)

    get_shop_info(home_url, 1)

    # total_seconds() covers the whole duration; .seconds alone would
    # drop full days from a very long run.
    elapsed = datetime.datetime.now() - start_time
    print('总共用时{}秒\n'.format(int(elapsed.total_seconds())))
    print('全部商品已经按需求完成!!!')

# Start the crawl only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()

运行后,会在当前目录下生成个result.csv文件,内容如下:

bb

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值