mysql 存储块输出日志_多线程爬取房天下数据,并且存储到mysql(增加爬取日志输出模块)...

# -*- coding: utf-8 -*-

import logging
import os
import random
import re
import threading
import time
import urllib.request
from multiprocessing import Process

import pymongo
import pymysql
import requests
import threadpool
import xlwt
from lxml import etree
from pymongo import MongoClient

import log

# Pool of desktop Chrome User-Agent strings. Ft.get_data picks one at random
# for each request via random.choice(user_agent_list).
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Sample Shenzhen listing URL. The visible code never reads this module-level
# name (Ft.get_data builds its own URL from its `url` parameter).
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'

# Module-level Excel workbook with a single sheet. Nothing in the visible code
# writes to it or saves it; the commented-out lines below show the intended use.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet Name")

# sheet.write(0, 2, 'foobar')  # row, column, value
# workbook.save("foobar.xls")

# Regular expression that extracts every URL found in a page:
# links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))

# Connect to the MongoDB database:
# client = MongoClient('localhost', int(27017))

43

class Ft(object):
    """Scrape fang.com new-house listing pages and store the rows in MySQL.

    ``get_data`` downloads and parses one listing page and hands the parsed
    records to ``save_mysql``. ``save_data`` is a MongoDB placeholder and
    ``log`` configures file logging for the standard ``logging`` module.
    """

    # Seconds to wait for the listing page. Without a timeout the request can
    # block a worker thread forever and the timeout handler can never fire.
    REQUEST_TIMEOUT = 10

    # Placeholders are bound by the driver, so quotes or SQL fragments inside
    # scraped text cannot break or alter the statement.
    INSERT_SQL = (
        'INSERT INTO ftx(link,adr,adress,price,phone) '
        'VALUES (%s,%s,%s,%s,%s)'
    )

    def save_mysql(self, d_t):
        """Insert scraped records into the ``ftx`` table.

        Args:
            d_t: iterable of records; each record is indexable and holds at
                least five fields in the order link, name, address, price,
                phone. Every field is converted with ``str`` before insert.

        Raises:
            IndexError: if a record has fewer than five fields.
            pymysql.MySQLError: on connection or insert failure; nothing is
                committed in that case.
        """
        rows = [tuple(str(record[k]) for k in range(5)) for record in d_t]
        if not rows:
            # Nothing to store: do not open a connection at all.
            return

        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = pymysql.connect(host='192.168.191.1', user='root',
                               passwd='123456789', db='data', port=3306,
                               charset='utf8')
        try:
            # One connection and one batch per call; each record is inserted
            # exactly once.
            with conn.cursor() as cur:
                cur.executemany(self.INSERT_SQL, rows)
            conn.commit()
        finally:
            # Always release the connection, even when the insert fails.
            conn.close()

    def get_data(self, url):
        """Download one Guangzhou listing page, parse it and store the rows.

        Args:
            url: page number; it is converted with ``str`` and spliced into
                the listing URL after ``b9`` (the name is kept for callers).

        Returns:
            None. On a timeout or a non-200 response nothing is parsed or
            stored.
        """
        # The Shenzhen listing uses the same pattern with 'sz' in place of
        # 'gz' and '.page.9' as the suffix.
        page_url = ('http://newhouse.gz.fang.com/house/s/b9' + str(url)
                    + '/?ctm=1.gz.xf_search.page.6')
        headers = {'User-Agent': random.choice(user_agent_list)}

        try:
            html = requests.get(page_url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.Timeout as e:
            # NOTE(review): the success path calls log.kk but this path calls
            # log.gg.kk; confirm both exist in the project's log module.
            log.gg.kk(e)
            # There is no response object to parse after a timeout.
            return

        html.encoding = 'gbk'
        if html.status_code != 200:
            print('下载失败!!!')
            return
        log.kk('下载网页数据成功')

        selector = etree.HTML(str(html.text))
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        addr = [name.strip() for name in
                selector.xpath('//div[@class="nlcd_name"]/a/text()')]
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')

        # zip stops at the shortest list, so a listing missing any one field
        # shortens the result rather than raising.
        r = list(zip(links, addr, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Placeholder for MongoDB storage; ``get_dat`` is currently unused.

        Only opens a client to the local MongoDB server and closes it again.
        """
        # Connect to MongoDB; reserved interface, can be ignored for now.
        client = MongoClient('localhost', 27017)
        client.close()

    def log(self):
        """Configure root logging to ``myapp.log`` and emit sample messages.

        The file is opened with mode ``'w'``, so it is truncated each time
        the configuration takes effect. ``logging.basicConfig`` does nothing
        if the root logger already has handlers.
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='myapp.log',
            filemode='w',
        )
        logging.debug('This is debug message')
        logging.info('This is info message')
        logging.warning('This is warning message')

def main():
    """Scrape listing pages 0 and 1 concurrently through a thread pool."""
    scraper = Ft()
    pool = threadpool.ThreadPool(50)
    # One work request per page number; each calls scraper.get_data(page).
    for req in threadpool.makeRequests(scraper.get_data, range(2)):
        pool.putRequest(req)
    # Block until every queued request has been processed.
    pool.wait()


if __name__ == "__main__":
    main()

下面附上建表代码:

-- Table that receives the scraped rows; the column names match the
-- INSERT INTO ftx(link,adr,adress,price,phone) statement in the scraper.
create table ftx(
    id int not null auto_increment,
    link varchar(100) not null,
    adr varchar(100) not null,
    adress varchar(100) not null,
    price varchar(100) not null,
    phone varchar(100) not null,
    PRIMARY KEY (id)
);

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值