I've just started learning Python, and this is my first scraper exercise, so please bear with its shortcomings.
It uses two libraries, BeautifulSoup and requests, to scrape ZhuanZhuan (转转) listings on 58.com; the page was just one I clicked into at random.
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import requests

url = "http://gz.58.com/iphonesj/"

def getSecondCommodityInfo(url):
    webData = requests.get(url)
    soup = BeautifulSoup(webData.text, "lxml")
    # Note: if the selector copied from the browser contains nth-child(2),
    # change it to nth-of-type(2)
    commoditieImgs = soup.select("tbody > tr > td.img > a > img")
    titles = soup.select("tbody > tr > td.t > a")
    prices = soup.select("tbody > tr > td > span.pricebiao > span")
    sellersImgs = soup.select("tbody > tr > td.tc > div > p.img_attest > img")
    sellers = soup.select("tbody > tr > td.tc > div > p:nth-of-type(2)")
    dataList = []
    for commoditieImg, title, price, seller, sellersImg in zip(commoditieImgs, titles, prices, sellers, sellersImgs):
        data = {
            'commoditieImg': commoditieImg.get("lazy_src"),
            'title': title.get_text().strip(),
            'price': price.get_text(),
            'sellersImg': sellersImg.get("src"),
            'seller': seller.get_text().strip()
        }
        # print title.get_text()  # printing a single string works fine
        print(data)  # printing a dict or list garbles the Chinese; Python 3 does not have this problem
        # the next line prints correctly; json.dumps(data, encoding="UTF-8", ensure_ascii=False) also works
        print repr(data).decode("unicode-escape")
        dataList.append(repr(data).decode("unicode-escape"))

getSecondCommodityInfo(url)
First, requests.get(url) fetches the page, and the response text is handed to BeautifulSoup for parsing.
soup.select() then pulls out the tags you want; its argument is a string in CSS-selector format (a short sketch follows below).
Press F12 in the browser to inspect the page's markup, find the tags that hold the information you need, look at what the similar tags have in common, and write the CSS selector from that.
Because soup.select() returns a list, each group of results is then packed into a dict, which is easier to read.
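For instance, a minimal sketch of how select() behaves (the HTML snippet here is made up purely for illustration):

from bs4 import BeautifulSoup

html = '<div><p class="t"><a href="http://example.com">iPhone 6s</a></p></div>'
soup = BeautifulSoup(html, "lxml")
tags = soup.select("div > p.t > a")  # select() always returns a list of matching tags
print(tags[0].get_text())            # iPhone 6s
print(tags[0].get("href"))           # http://example.com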
There is one small snag here: encoding.
Printing soup directly shows the Chinese correctly in the PyCharm console, but printing titles, or printing data inside the loop, produces garbled output. The console encoding is apparently ASCII, and when Python 2.7 prints a dict or list it prints the repr of the contents, so any Chinese inside shows up as escape sequences. This happens even with # -*- coding: UTF-8 -*- at the top of the file, or with
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
-- neither makes the Chinese display. Only printing like this
print repr(data).decode("unicode-escape")
avoids the garbling.
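A minimal sketch of the symptom and the workaround under Python 2.7 (the dict is just a stand-in for the scraped data; the exact console output also depends on the console's encoding):

# -*- coding: UTF-8 -*-
import json

d = {'title': u'苹果手机'}
print d                                  # {'title': u'\u82f9\u679c\u624b\u673a'} -- escapes, not Chinese
print repr(d).decode("unicode-escape")   # {'title': u'苹果手机'}
print json.dumps(d, ensure_ascii=False)  # {"title": "苹果手机"} -- the json.dumps alternative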
Next, write the scraped data to a txt file, test.txt, created in the project root.
import codecs

def getSecondCommodityInfo2(url):
    webData = requests.get(url)
    soup = BeautifulSoup(webData.text, "lxml")
    commoditieImgs = soup.select("tbody > tr > td.img > a > img")
    titles = soup.select("tbody > tr > td.t > a")
    prices = soup.select("tbody > tr > td > span.pricebiao > span")
    sellersImgs = soup.select("tbody > tr > td.tc > div > p.img_attest > img")
    sellers = soup.select("tbody > tr > td.tc > div > p:nth-of-type(2)")
    # fo = open("test.txt", "wb")  # a plain open() garbles the Chinese, see below
    fo = codecs.open("test.txt", "wb", "utf-8")
    for commoditieImg, title, price, seller, sellersImg in zip(commoditieImgs, titles, prices, sellers,
                                                               sellersImgs):
        fo.write('commoditieImg:' + commoditieImg.get("lazy_src") + "\n")
        fo.write('title:' + title.get_text().strip() + "\n")
        fo.write('price:' + price.get_text() + "\n")
        fo.write('sellersImg:' + sellersImg.get("src") + "\n")
        fo.write('seller:' + seller.get_text().strip() + "\n")
        fo.write("\n\n")
    fo.close()

getSecondCommodityInfo2(url)
The small snag here: if instead of fo = codecs.open("test.txt", "wb", "utf-8") you open the file with a plain fo = open("test.txt", "wb") and write to it, the Chinese is mangled again (with Python 2's default ASCII codec, writing a unicode string to a plain file typically raises a UnicodeEncodeError); codecs.open() encodes everything as UTF-8 and avoids this.
If you really must use fo = open("test.txt", "wb"), then you have to add
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
beforehand, and then the output is no longer garbled.
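A minimal sketch of the difference (Python 2.7; the string is just an example of the unicode values that get_text() returns):

# -*- coding: UTF-8 -*-
import codecs

text = u'苹果手机'

# fo = open("test.txt", "wb")
# fo.write(text)  # encodes with the default ASCII codec -> UnicodeEncodeError

fo = codecs.open("test.txt", "wb", "utf-8")  # everything written is encoded as UTF-8
fo.write(text)
fo.close()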
I've only just started and my fundamentals aren't solid yet, so if anything here is wrong, please point it out. Thanks.
For more on the encoding problem, this expert's blog post is worth a read:
https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431664106267f12e9bef7ee14cf6a8776a479bdec9b9000