58同城 爬取省市添加到数据库

该博客介绍了如何使用Python的Selenium库爬取58同城网站的城市信息,并利用PyMySQL将数据存储到MySQL数据库中。通过遍历页面上的省份和城市元素,提取城市名称和链接,然后调用数据库插入函数,实现数据持久化。在遇到异常时,程序能够进行回滚操作,确保数据一致性。
摘要由CSDN通过智能技术生成

方法一:

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# MySQL connection — the '1' values are placeholders; fill in the real
# host/user/password/database before running.
db = pymysql.connect(host='1', user='1', password='1', database='1')
# Module-level cursor shared by the insert helper below.
cursor = db.cursor()


def insert_data(province, city, url, logger, db, cursor):
    """Insert one (province, city, url) row into the 58city_info table.

    Commits on success; on any failure reports the error and rolls back
    the transaction so the connection stays in a consistent state.

    Args:
        province: Province name the city belongs to.
        city: City name.
        url: City page URL on 58.com.
        logger: Optional logger object; when falsy (the caller passes ''),
            errors fall back to print() as before.
        db: Open DB-API connection (commit/rollback target).
        cursor: Cursor created from ``db``.
    """
    sql = 'insert into 58city_info(`province`,`city`,`url`)values (%s,%s,%s)'
    try:
        # Parameterized query: values are escaped by the driver, never
        # string-formatted into the SQL, so odd characters in names/URLs
        # are handled safely.
        cursor.execute(sql, (province, city, url))
        db.commit()
    except Exception as e:
        # FIX: the original accepted `logger` but never used it.
        if logger:
            logger.error('insert_data failed: %s', e)
        else:
            print(e)
            print('connect_mysql 34line')
        # Roll back so a failed insert does not leave an open transaction.
        db.rollback()


def br():
    """Open Chrome, load the 58.com city-switch page, scrape it, then quit."""
    chrome_options = Options()
    browser = webdriver.Chrome(options=chrome_options)
    # FIX: timeouts must be configured BEFORE get(), otherwise the initial
    # page load is not bounded by them (the original set them afterwards).
    browser.set_page_load_timeout(15)
    browser.set_script_timeout(15)
    try:
        browser.get('https://www.58.com/changecity.html?fullpath=0&PGTID=0d100000-0005-d7dc-68c3-6a71c706285a&ClickID=2')
        all_city(browser)
    finally:
        # Ensure the browser process is not leaked if scraping raises.
        browser.quit()


def all_city(browser):
    """Walk every province section on the page and store each city.

    For each province block, pairs city names with their links and inserts
    one row per city via insert_data (module-level db/cursor are used).
    """
    provinces = browser.find_elements_by_xpath('//*[@class="content-province"]')
    for province_el in provinces:
        province = province_el.find_element_by_xpath('.//*[@class="content-province-title"]').text
        for cities_el in province_el.find_elements_by_xpath('.//div[@class="content-cities"]'):
            # Query the anchors once and reuse them; the original ran the
            # same XPath twice (once for names, once for hrefs).
            anchors = cities_el.find_elements_by_xpath('.//a[@class="content-city"]')
            for a in anchors:
                city = a.text
                link = a.get_attribute('href')
                print(city, link)
                # BUG FIX: the original passed the WHOLE lists
                # (city=city_2, url=link) instead of the single city/link
                # belonging to this row.
                insert_data(province=province, city=city, url=link, logger='', db=db, cursor=cursor)

if __name__ == '__main__':
    br()

关于 `for k in city_2:` 之后再用 `city_3 = city_2[k]` 会出错的原因:此时 k 是城市名字符串本身,而不是整数索引,用字符串给列表做下标访问会抛出 TypeError。

for city in city_2:
    city = city
    index_ = city_2.index(city)
    link_ = link[index_]
    print(city, link_)
for k in range(len(city_2)):
    city = city_2[k]
    link_ = link[k]
    print(city, link_)

两种方法可以解决

方法二:

import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# MySQL connection — the '1' values are placeholders; fill in the real
# host/user/password/database before running.
db = pymysql.connect(host='1', user='1', password='1', database='1')
# Module-level cursor shared by the insert helper below.
cursor = db.cursor()


def insert_data(province, city, url, db, cursor):
    """Persist a single (province, city, url) record into 58city_info.

    Commits the transaction on success; on any failure prints the error
    and rolls back so the connection stays usable.
    """
    statement = 'insert into 58city_info(`province`,`city`,`url`)values (%s,%s,%s)'
    row = (province, city, url)
    try:
        cursor.execute(statement, row)
        db.commit()
    except Exception as err:
        print(err)
        print('connect_mysql 34line')
        db.rollback()


def br():
    """Open Chrome, scrape city data, then release browser and DB resources."""
    chrome_options = Options()
    browser = webdriver.Chrome(options=chrome_options)
    # FIX: configure timeouts before navigating so the initial get() is
    # bounded by them (the original set them after the page load).
    browser.set_page_load_timeout(15)
    browser.set_script_timeout(15)
    try:
        browser.get('https://www.58.com/changecity.html?fullpath=0&PGTID=0d100000-0005-d7dc-68c3-6a71c706285a&ClickID=2')
        all_city(browser)
    finally:
        # Always release resources — the original leaked the browser
        # process and the DB connection on any exception.
        browser.quit()
        cursor.close()
        db.close()


def all_city(browser):
    """Iterate the page's province sections and insert every city link.

    Each province block contributes one row per city anchor, written via
    insert_data using the module-level db connection and cursor.
    """
    province_blocks = browser.find_elements_by_xpath('//*[@class="content-province"]')
    for block in province_blocks:
        title_el = block.find_element_by_xpath('.//*[@class="content-province-title"]')
        province_name = title_el.text
        cities_container = block.find_element_by_xpath('.//div[@class="content-cities"]')
        for anchor in cities_container.find_elements_by_xpath(".//a"):
            insert_data(
                province=province_name,
                city=anchor.text,
                url=anchor.get_attribute("href"),
                db=db,
                cursor=cursor,
            )


if __name__ == '__main__':
    br()

关于数据库建表

1.给字段加索引

给city字段加唯一的索引,避免重复。

其他字段保持默认设置,不额外建索引。

2.建立数据库表

 爬取成功

 

 

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值