58同城爬取省市添加到数据库

_阿衡_

于 2021-12-30 11:11:38 发布

阅读量850

点赞数 1

文章标签：数据库 python chrome

本文链接：https://blog.csdn.net/weixin_45910860/article/details/122231557

版权

该博客介绍了如何使用Python的Selenium库爬取58同城网站的城市信息，并利用PyMySQL将数据存储到MySQL数据库中。通过遍历页面上的省份和城市元素，提取城市名称和链接，然后调用数据库插入函数，实现数据持久化。在遇到异常时，程序能够进行回滚操作，确保数据一致性。

摘要由CSDN通过智能技术生成

方法一：

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Shared MySQL connection and cursor, opened as soon as the module is loaded
# and passed to insert_data from all_city.
# NOTE(review): the '1' values look like placeholders for the real host,
# user, password and database name — confirm before running.
db = pymysql.connect(host='1', user='1', password='1', database='1')
cursor = db.cursor()


def insert_data(province, city, url, logger, db, cursor):
    """Insert one (province, city, url) row into ``58city_info`` and commit.

    Args:
        province: Value stored in the ``province`` column.
        city: Value stored in the ``city`` column.
        url: Value stored in the ``url`` column.
        logger: Accepted for the caller's convenience; not used here.
        db: Connection object providing ``commit()`` and ``rollback()``.
        cursor: Cursor object providing ``execute(sql, params)``.

    Returns:
        None. If executing or committing raises, the exception is printed,
        followed by a fixed marker line, and the transaction is rolled back;
        the exception is not re-raised.
    """
    statement = 'insert into 58city_info(`province`,`city`,`url`)values (%s,%s,%s)'
    row = (province, city, url)
    try:
        # Values are sent as parameters, never formatted into the SQL text.
        cursor.execute(statement, row)
        db.commit()
    except Exception as error:
        print(error)
        print('connect_mysql 34line')
        db.rollback()


def br():
    """Open the 58.com change-city page in Chrome and scrape it.

    Starts a Chrome session, loads the city-selection page, hands the
    browser to ``all_city`` and always shuts the browser down afterwards.
    Exceptions from loading or scraping propagate to the caller.
    """
    chrome_options = Options()
    browser = webdriver.Chrome(options=chrome_options)
    try:
        # Timeouts only apply to navigations and scripts started after they
        # are set, so configure them before the first page load.
        browser.set_page_load_timeout(15)
        browser.set_script_timeout(15)
        browser.get('https://www.58.com/changecity.html?fullpath=0&PGTID=0d100000-0005-d7dc-68c3-6a71c706285a&ClickID=2')
        all_city(browser)
    finally:
        # Quit even when loading or scraping fails so that no Chrome /
        # chromedriver process is left running.
        browser.quit()


def all_city(browser):
    """Scrape every province/city link on the page and store it in MySQL.

    Args:
        browser: Selenium WebDriver that has already loaded the 58.com
            change-city page.

    For each ``content-province`` element the province title is read, then
    every ``a.content-city`` anchor inside each ``content-cities`` container.
    Each anchor is printed and written as one row (province, city text, href)
    through ``insert_data`` using the module-level ``db`` and ``cursor``.
    """
    # The generic locator form is used because the find_element(s)_by_xpath
    # helpers were removed in Selenium 4.3; 'xpath' is the value of By.XPATH
    # and is accepted by both Selenium 3 and 4.
    provinces = browser.find_elements('xpath', '//*[@class="content-province"]')
    for province_el in provinces:
        province_1 = province_el.find_element('xpath', './/*[@class="content-province-title"]').text
        containers = province_el.find_elements('xpath', './/div[@class="content-cities"]')
        for container in containers:
            # Query the anchors once and read name and href from the same
            # element, so the two values can never get out of step.
            for anchor in container.find_elements('xpath', './/a[@class="content-city"]'):
                city = anchor.text
                link_ = anchor.get_attribute('href')
                print(city, link_)
                # Insert this single city and its link; passing the whole
                # lists here would bind lists to the %s placeholders.
                insert_data(province=province_1, city=city, url=link_, logger='', db=db, cursor=cursor)

# Start the crawl only when this file is executed directly.
if __name__ == '__main__':
    br()

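关于 `for k in city_2:` 配合 `city_3 = city_2[k]` 会出错的原因：

`for k in city_2` 取到的 k 是列表里的元素（城市名字符串），不是下标；用字符串去索引列表，`city_2[k]` 会抛出 TypeError（列表下标必须是整数或切片）。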

# Fix 1: iterate over the city names themselves and look up each name's
# position to pick the link stored at the same index. list.index returns the
# first match, so a repeated name always resolves to its first occurrence.
for city in city_2:
    city = city  # self-assignment; has no effect
    index_ = city_2.index(city)
    link_ = link[index_]
    print(city, link_)

            # Fix 2: loop over integer positions so that city_2[k] and link[k]
            # are both indexed with the same valid list index.
            for k in range(len(city_2)):
                city = city_2[k]
                link_ = link[k]
                print(city, link_)

以上两种写法都可以解决这个问题；其中第一种用 list.index 查下标，如果城市名有重复，只会取到第一次出现的位置。

方法二：

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Shared MySQL connection and cursor, opened as soon as the module is loaded;
# all_city passes them to insert_data and br closes them when it finishes.
# NOTE(review): the '1' values look like placeholders for the real host,
# user, password and database name — confirm before running.
db = pymysql.connect(host='1', user='1', password='1', database='1')
cursor = db.cursor()


def insert_data(province, city, url, db, cursor):
    """Insert one (province, city, url) row into ``58city_info`` and commit.

    Args:
        province: Value stored in the ``province`` column.
        city: Value stored in the ``city`` column.
        url: Value stored in the ``url`` column.
        db: Connection object providing ``commit()`` and ``rollback()``.
        cursor: Cursor object providing ``execute(sql, params)``.

    Returns:
        None. If executing or committing raises, the exception is printed,
        followed by a fixed marker line, and the transaction is rolled back;
        the exception is not re-raised.
    """
    statement = 'insert into 58city_info(`province`,`city`,`url`)values (%s,%s,%s)'
    row = (province, city, url)
    try:
        # Values are sent as parameters, never formatted into the SQL text.
        cursor.execute(statement, row)
        db.commit()
    except Exception as error:
        print(error)
        print('connect_mysql 34line')
        db.rollback()


def br():
    """Open the 58.com change-city page in Chrome, scrape it, and clean up.

    Starts a Chrome session, loads the city-selection page and hands the
    browser to ``all_city``. The browser is always quit, and the module-level
    ``cursor`` and ``db`` are always closed, even if loading or scraping
    raises. Exceptions propagate to the caller.
    """
    try:
        chrome_options = Options()
        browser = webdriver.Chrome(options=chrome_options)
        try:
            # Timeouts only apply to navigations and scripts started after
            # they are set, so configure them before the first page load.
            browser.set_page_load_timeout(15)
            browser.set_script_timeout(15)
            browser.get('https://www.58.com/changecity.html?fullpath=0&PGTID=0d100000-0005-d7dc-68c3-6a71c706285a&ClickID=2')
            all_city(browser)
        finally:
            # Quit even on failure so no Chrome / chromedriver process is
            # left running.
            browser.quit()
    finally:
        # Release the database resources on every path, including when the
        # browser could not be started.
        cursor.close()
        db.close()


def all_city(browser):
    """Scrape every province/city link on the page and store it in MySQL.

    Args:
        browser: Selenium WebDriver that has already loaded the 58.com
            change-city page.

    For each ``content-province`` element the province title is read, then
    every anchor inside its first ``content-cities`` container. Each anchor
    is written as one row (province, anchor text, href) through
    ``insert_data`` using the module-level ``db`` and ``cursor``.
    """
    # The generic locator form is used because the find_element(s)_by_xpath
    # helpers were removed in Selenium 4.3; 'xpath' is the value of By.XPATH
    # and is accepted by both Selenium 3 and 4.
    provinces = browser.find_elements('xpath', '//*[@class="content-province"]')
    for province_el in provinces:
        province_1 = province_el.find_element('xpath', './/*[@class="content-province-title"]').text
        cities_box = province_el.find_element('xpath', './/div[@class="content-cities"]')
        for anchor in cities_box.find_elements('xpath', ".//a"):
            city = anchor.text
            link = anchor.get_attribute("href")
            insert_data(province=province_1, city=city, url=link, db=db, cursor=cursor)


# Start the crawl only when this file is executed directly.
if __name__ == '__main__':
    br()