基于规则嵌入的论文比对系统(五)

子空间sentence的序列化处理

word2vec生成词向量

  根据论文的所有abstracts的内容,生成词向量,存到一个txt文件里,便于后续加载和处理。

29568 256
the 0.035862766 0.05651933 -0.13845906 -0.2546235 0.073469386 -0.33557066 0.038079146 -0.11025188 -0.087536 -0.13868436 0.1831769 0.0932296 0.024537407 0.0023498852 0.011696546 -0.034453694 -0.11605296 0.21995386 -0.018928478 -0.008253324 0.017743122 0.03907303 -0.05928613 -0.0106651345 -0.093619585 0.06805104 -0.3242432 0.048463102 0.061988793 -0.03335136 0.22211164 -0.0023496875 0.10543881 0.015738107 -0.11722655 0.040255677 -0.28645578 -0.117378674 -0.016725404 0.04248449 -0.08492259 0.022461995 -0.108763866 -0.0005065069 -0.1188838 0.15089455 0.03371828 -0.1917253 -0.05503096 -0.051539652 0.11655843 0.20786701 -0.026647128 -0.12137232 0.10744926 -0.09407011 0.15166277 0.085610636 0.2555446 -0.015083296 -0.0798358 0.1034285 -0.22721817 -0.01943691 0.20197836 0.07306566 -0.09933606 -0.052230384 0.13510253 0.0018805577 -0.09226684 -0.20121044 0.17690231 -0.020492425 0.060497146 -0.03856501 -0.1348039 -0.07840238 0.10775316 -0.07315992 -0.030605689 0.067973755 -0.030745989 -0.09205862 0.23574686 0.17675664 -0.00489038 -0.014121906 0.0663818 0.21396486 0.15804145 0.22335975 -0.15428512 0.018491616 0.17056264 0.0023703328 -0.005144479 0.027178425 -0.08909329 0.22572803 0.03970667 -0.069989815 -0.15706807 0.024570433 0.18333714 0.16567478 -0.24253117 -0.0037482255 0.022224482 0.018520018 -0.05936141 -0.19532585 -0.013955102 0.0034956967 -0.034342013 -0.047769044 -0.21160112 0.050397076 0.1904041 -0.27010483 0.09838401 0.14096552 -0.105789125 -0.07994003 -0.11116939 0.006168318 0.06483588 0.02871086 0.061052114 0.22637305 -0.05277482 0.09836477 0.14212212 0.052983876 -0.11212492 0.01043699 -0.045089543 0.069853954 -0.021853043 0.23303291 0.10891996 0.34624174 0.07720164 0.018842692 -0.17537639 0.065415315 0.062281214 0.041409723 0.05099125 0.013758497 0.10524783 0.016542008 -0.010084982 -0.18610474 0.051893994 -0.03127661 0.1035369 -0.17339604 0.109470434 -0.21656886 0.06447648 0.1105595 -0.11183168 -0.10454787 -0.04509689 -0.008678421 0.3036383 0.048570246 0.03335835 
0.042847507 -0.02997469 -0.055580124 -0.081671745 -0.0028917084 -0.080944315 0.12669 -0.16668177 -0.023343757 -0.1388582 0.021800075 -0.12612003 -0.027395174 0.13423708 0.19527224 0.09033503 0.10455928 0.04652923 -0.024046093 0.074600406 -0.08752005 -0.10372582 0.05039561 -0.09405545 -0.0019780854 0.117732845 0.21978344 -0.049822643 -0.06720394 0.013318004 -0.04046773 -0.0421535 0.042512666 0.096636884 0.08146994 0.20490798 0.00041155567 -0.039567035 0.016981898 0.27782878 0.045378942 0.14153063 -0.022803085 0.04095481 -0.23691948 -0.055436462 0.13468432 -0.117817536 0.022387687 -0.06422265 0.008768341 0.06515691 0.02714455 -0.02888842 0.21673594 0.07622385 0.16567388 0.24391475 0.17833297 -0.052876405 -0.09859826 -0.07593555 0.18415456 0.025741309 -0.11754173 -0.028055444 0.01102257 -0.17095137 -0.21360043 0.023690488 0.007879125 -0.21210076 -0.23271689 0.03511152 -0.051352426 0.121850684 0.27004912 -0.04634305 0.13857749 -0.021071441 -0.050652977 -0.05973535 -0.0695009 0.011206372 -0.032137994 0.20331028 0.040266316
of 0.12849346 -0.038374864 -0.21726947 -0.23623809 0.0072670514 -0.19325967 0.056275915 -0.18516013 -0.21954063 0.08157015 0.09655487 -0.018035423 0.029610502 -0.04109373 -0.2920059 -0.097005464 -0.16312808 0.22356857 0.26416877 -0.023835583 -0.08279553 -0.0739546 0.010587783 0.13753797 -0.17640391 0.007150682 -0.08602739 -0.07995482 -0.09335423 -0.11353922 -0.009541503 0.050460067 -0.023821043 -0.1562859 -0.054326642 0.08693013 -0.017441336 -0.20081405 0.10249073 0.069799386 -0.02312843 -0.13599575 0.14033036 -0.07791778 -0.1669349 -0.07758213 0.05589564 -0.022555452 -0.023131605 -0.33963928 0.1676876 0.01711576 -0.26108056 -0.17287576 0.030780835 -0.13593191 0.0040517957 0.04882523 0.20070645 -0.02877354 0.014941647 -0.13075219 -0.12229083 -0.029134711 0.31384766 0.21073821 -0.092495866 -0.14083135 0.14015721 0.13556193 -0.12380283 -0.08018971 0.057521775 -0.102349445 0.046707902 0.07569913 -0.09465484 0.102794304 -0.043747865 -0.07291616 0.15547566 0.019743895 -0.058097478 -0.08889357 0.2480444 0.0005270945 0.048324846 0.031630814 -0.0719861 0.20871308 0.07671626 0.1555422 0.08775004 -0.06739063 0.18764232 0.0077554504 -0.14992023 -0.0568398 -0.035382304 0.17242305 0.066531464 -0.1437938 -0.09018516 -0.064523265 0.23776047 0.039749816 -0.17001869 -0.057296786 0.08235087 0.1093241 -0.11502964 -0.1483718 0.011784136 -0.09216416 0.13866809 0.06489567 -0.29308772 -0.08711632 0.03430539 -0.06173933 0.0988205 0.23684303 -0.0771011 -0.040897246 -0.06877006 0.18571596 -0.1483957 0.14415653 0.14974982 0.05498233 -0.08508238 -0.18351726 -0.20554332 -0.14143735 -0.25095782 0.079646446 -0.055246606 0.2018746 -0.04395196 0.32548967 0.040695004 0.12978078 0.16108695 0.0625185 -0.05185656 0.14251573 -0.07218972 0.1121681 -0.02953957 0.041173093 -0.014048695 0.20571384 -0.039708406 -0.07864705 -0.1480847 -0.1952482 0.075986646 0.12970941 0.06311849 -0.16310567 0.12505066 -0.050449923 0.020367898 -0.16442178 0.049169473 0.055377696 0.2252356 -0.15418674 -0.12305761 0.15781315 
-0.2697884 0.026485465 -0.091644295 0.058260724 0.043217614 0.09814414 -0.018973967 0.0031637573 -0.039911807 0.17663656 -0.14935772 0.06753029 -0.0076911305 0.2424567 0.10705174 0.14476845 1.28393e-06 -0.03634551 0.09592322 0.057333212 -0.11520119 -0.11390804 -0.10732842 -0.13405176 -0.06793697 0.18914136 -0.18525864 -0.10997568 0.13948508 0.06383391 -0.012125808 0.24806257 0.014834411 -0.04725281 0.21277687 -0.0024437162 -0.022894358 -0.17289625 0.24269715 0.19974564 0.0042658453 -0.12593499 0.08056272 -0.08520054 -0.0028169015 0.17984973 -0.13100728 0.116517514 -0.092017695 -0.0028922895 0.035654154 0.1651672 -0.0814838 0.06906618 0.0015906562 0.15338942 0.22083238 0.15202612 0.06728807 -0.40546265 -0.12677379 0.18030441 -0.15009591 -0.22312073 -0.15074345 0.08860908 -0.059699744 -0.2489533 -0.18103868 -0.12229578 -0.07385952 -0.113516115 -0.11145389 0.012051008 0.033645753 0.14587979 -0.013799987 0.21896431 -0.10061397 -0.14903513 -0.111849345 -0.12663224 -0.020129384 -0.18233676 0.28585008 0.0018139461
and -0.07127609 -0.044888906 -0.069604956 -0.25921774 0.1931875 -0.35993946 -0.22376493 -0.069725655 -0.03335043 -0.07081277 0.1505293 -0.04145995 0.013732996 -0.050636888 -0.12888664 -0.15025444 -0.018658835 0.05059064 0.013908123 0.09837051 0.015551415 0.061768904 0.06924774 0.077785715 -0.08169236 -0.1943853 -0.07962784 -0.0055806953 -0.0924115 -0.17955369 0.06326961 -0.02996396 -0.06625023 0.07338547 -0.066543646 0.08101323 -0.069021955 -0.16151203 -0.02528213 -0.046900995 0.03949358 0.11180807 0.014261134 0.11787859 -0.12339867 0.014568019 0.0711619 -0.14604051 -0.1340129 -0.15526149 0.084187284 -0.05127112 -0.021511167 -0.12523176 -0.0631306 -0.03928472 0.28775817 0.020193879 0.2020079 0.14572993 -0.11710899 -0.122083336 -0.2323962 -0.118685134 0.25138125 0.029238464 -0.27948797 0.15095307 0.113684356 0.0060326867 -0.06454202 0.1587443 0.1533965 -0.26090834 -0.015668897 0.038606722 -0.027877314 -0.04612933 -0.14147165 -0.09609122 -0.1442632 0.19278805 0.1636246 -0.013424769 0.04763345 0.055443004 -0.06524911 -0.08158471 -0.04804702 -0.061860982 0.12734857 0.09030175 -0.19224048 0.14929116 0.19901548 0.03866965 -0.25165933 -0.18131381 -0.14869653 0.16278648 -0.060493004 0.015355353 -0.35804695 -0.1561485 0.06490386 0.02698599 -0.23905157 -0.074219994 -0.0056993505 -0.009390085 -0.05424787 -0.08825547 -0.012806901 0.024632828 -0.058895 0.3275212 -0.15027127 0.10860957 -0.10491723 -0.079997554 0.17627192 0.15164022 -0.06016935 -0.099427395 0.102293625 0.07503716 0.025830502 0.011025565 0.18863241 0.095842674 -0.02334994 0.049216148 0.037897184 -0.12451066 -0.16150191 0.09453289 0.024780696 0.1409083 0.1989458 -0.18524201 -0.03180072 0.15928698 0.01372696 0.020157287 0.051102385 -0.049204513 -0.022003653 0.16866015 0.019598264 0.081629306 0.06338399 -0.030869931 -0.19415496 -0.021427343 -0.0088173235 0.036614466 0.03472175 0.10770557 0.075381026 -0.068667054 -0.11426741 -0.017114548 -0.09492375 -0.057254132 0.13729817 -0.002978448 0.2110939 -0.009718338 
-0.020392923 -0.11497368 -0.0860336 0.05793075 -0.096447214 0.16152295 0.007100256 -0.041267898 0.015386765 -0.13105036 0.019013165 0.17477027 -0.051382165 -0.046271745 -0.07848718 0.049028467 -0.008928964 0.02445859 -0.078553535 -0.012971613 -0.018527266 0.098032795 -0.13494815 -0.19062604 0.07305754 0.09742698 -0.1813251 0.1709607 -0.087730974 0.03465632 0.03681193 -0.18974815 0.0032672142 0.10054808 0.23013145 -0.11754151 0.19379738 0.113429464 -0.0977608 -0.1592892 0.17366189 -0.013998513 0.023229968 -0.092417575 0.017380742 -0.042715948 -0.04976056 -0.006591005 -0.11284442 -0.07315018 0.0068230294 0.14417413 0.11669512 0.22412713 -0.15588866 -0.024499953 0.16647804 0.18170893 0.23996775 0.2264444 0.14164022 -0.11795925 -0.18683885 0.02173098 0.10543122 -0.2074476 -0.2126881 0.09402561 -0.25419122 -0.14795537 -0.058694728 0.033906493 0.0061014285 -0.008193694 -0.038711578 0.05437558 0.03968208 0.27193233 -0.17297266 0.27052182 -0.06266909 0.20434158 -0.121095255 -0.07806227 0.16117385 -0.027225586 0.103388496 0.12309575
a 0.1460243 0.009291135 -0.070897326 -0.19982591 -0.0039374433 -0.09217913 -0.019723296 -0.09426657 -0.23815659 -0.30133265 0.16817756 -0.11533515 0.1039321 0.15770115 0.1676312 -0.112769336 -0.04030134 0.13103114 -0.028364655 -0.004390045 -0.111425236 0.01437379 0.1354171 0.16137356 0.14960754 0.07047798 -0.15872504 -0.13538575 0.029882964 -0.052602045 0.13927042 0.0033258847 -0.1376521 0.07831747 0.0239025 0.17400523 -0.005157385 -0.07591702 0.18003967 -0.03431511 0.024678731 0.16883634 -0.4036753 -0.090004794 -0.22303866 0.06005245 -0.028208904 -0.09752467 0.15330155 -0.31046963 -0.032450102 0.021493385 0.09468123 -0.07694768 0.025835665 -0.057729315 0.12521097 -0.022502227 0.14487147 0.110956356 -0.21407531 0.22676559 -0.082286395 0.018469516 0.1570545 0.048241407 -0.021728424 -0.23072667 0.05175059 -0.017774303 0.118273765 -0.0068588126 0.00067813956 -0.18124273 0.062585056 -0.007102416 -0.3891113 -0.028732155 -0.02982666 -0.16555695 0.11130756 0.17308195 0.019928372 0.14940523 0.15114002 -0.055666674 -0.04444017 0.18497764 0.19627818 0.11543113 -0.0961005 0.18745194 -0.09663762 0.018309029 0.039077163 -0.011236846 0.04658392 -0.072881006 -0.2766687 0.19521868 0.017404975 0.09103004 -0.10726668 -0.2510361 0.22799626 0.22212929 -0.15939544 0.27413008 0.17323822 0.25040618 0.20039868 -0.012740003 0.06787288 0.059162945 -0.016469594 0.0667544 -0.15959866 0.12102945 0.09797528 -0.039252114 0.035937246 0.32028618 0.0001366718 -0.01233205 0.008333325 -0.06594258 -0.018763652 0.10986244 0.040864844 0.31182903 -0.05635474 0.097195335 -0.1333675 -0.032866254 -0.03712148 -0.11570746 0.098269954 -0.0521253 0.11730674 0.4754777 0.026888603 0.37014696 0.09019004 0.060977828 0.04314345 0.14688492 0.03646329 0.13031942 0.058190007 0.14034115 0.04540529 -0.23482245 0.06392817 0.09341687 0.025591403 0.02887323 -0.12837067 -0.032988723 0.2353103 -0.08990258 0.12432604 0.121057145 0.00047103676 -0.15358552 0.029308995 0.22161147 -0.03279219 -0.03655143 -0.10547799 0.022021394 
-0.3026493 -0.25846928 0.03746424 0.19017604 0.107492544 0.15230107 -0.19104506 -0.14910738 -0.3294074 -0.15406993 -0.046884008 0.034522988 0.14234833 0.13902944 0.11808014 -0.024600472 -0.12648137 0.06334348 -0.07332069 -0.2663435 -0.2340481 0.1243913 0.13363732 -0.2923613 0.1624614 -0.048237745 0.12209708 -0.15457326 0.016372519 -0.09840412 -0.2732631 -0.0737498 0.14323488 0.120646216 0.31462902 0.08176914 0.00034253258 -0.20974436 0.1376393 -0.15025102 0.34007868 -0.00021151823 0.05146118 -0.16213322 0.047798127 0.2277937 -0.06408666 0.28979453 -0.047407314 -0.0076011354 0.16277671 0.1339205 -0.0005787103 0.06595009 -0.014407454 0.27107397 0.20547497 0.12841573 -0.14764163 -0.04147658 -0.28799087 0.13695757 0.11891036 0.10463409 -0.047459874 0.0666384 -0.11371216 -0.18668233 -0.14313085 0.04650518 -0.16257775 -0.07998014 -0.13793401 -0.009789161 0.10371586 0.33527792 -0.30386218 0.23032843 0.07526428 0.13549331 0.1392237 -0.11768942 0.05857614 -0.009856257 0.19396895 0.09976028
to 0.009508243 0.025056798 -0.1888818 -0.108808935 0.1166143 -0.31678423 -0.096842416 -0.095511034 0.05434912 -0.28438264 -0.012984291 0.067240246 0.017016243 -0.066931695 -0.052149177 -0.022251718 -0.12875952 0.33058313 0.23176688 -0.06265742 0.043026675 -0.012162347 -0.10742255 0.42515785 -0.21856625 0.10129418 -0.28219578 -0.2929 -0.13362235 -0.18561041 0.09423367 0.08472438 0.052957777 0.094379365 -0.09831079 -0.061776046 0.0069797607 -0.07679731 -0.104270756 -0.25312677 -0.01560412 -0.23872016 0.021601556 0.027156608 -0.14678824 -0.11802955 -0.16728356 -0.08393181 -0.016759047 -0.22890973 0.06526165 -0.055502776 0.1440748 -0.022989353 0.066030346 0.16313787 0.25201705 -0.049159225 0.1350282 0.019334292 0.09439728 0.012081764 -0.20356593 -0.067910776 0.2816524 0.0026324838 -0.4179653 -0.04180346 0.19838978 -0.033196595 0.033470407 0.005779976 -0.06209829 0.037718475 0.12045118 0.06836918 -0.24771804 0.2922861 -0.087355345 -0.01786293 0.06407181 -0.15425053 -0.0050302553 -0.021128532 -0.14219679 0.10401652 0.2584612 0.029185781 0.19695584 0.09083843 0.212602 0.11682628 -0.064519554 0.05300785 0.10728403 0.098207176 0.14719008 -0.3115979 -0.09182627 0.2979959 0.21802115 0.009801334 -0.03467545 -0.17110303 0.15496011 -0.08004846 -0.2142226 0.19082563 -0.12361031 -0.1189105 -0.24657401 -0.104220346 0.26430985 0.1354535 -0.10230196 0.006260716 -0.03509745 0.25165272 -0.18031928 -0.18463656 0.063109666 0.2572282 -0.22851129 -0.02322408 0.06940204 0.08116156 0.02169068 0.30836102 0.07563998 0.27068296 0.0036385932 0.078987665 0.03960993 -0.022047138 -0.014849979 0.096663326 -0.082392536 0.16706271 0.056396395 0.14581919 0.041111913 -0.15213034 0.20041597 0.3156818 -0.09725025 -0.11398526 -0.043844286 0.057301573 0.24547406 0.049937088 0.2076644 0.075304344 -0.114357024 -0.17312455 0.115742855 0.08157848 -0.024816502 -0.088214345 0.2166391 0.006649788 -0.14037888 0.17890425 -0.16528982 0.17021395 0.025162999 0.05510353 0.0025275175 -0.00892457 -0.07367319 -0.007118128 
-0.14382295 0.02514549 -0.08521854 -0.08558922 -0.26186407 0.37946188 0.06351345 -0.042617775 -0.25938115 -0.009181016 0.11695232 -0.2286682 -0.12307283 0.21207622 -0.04907598 0.13635153 -0.013098404 -0.23185265 -0.026282128 -0.03515865 -0.14201619 -0.06784334 0.17914844 -0.2609765 -0.12641548 0.17994317 -0.24493636 -0.07834588 0.14596178 -0.12051357 0.10891805 -0.0559813 0.078625485 -0.015418917 0.21985866 -0.0307393 -0.29147425 -0.2007735 0.16988863 0.060827076 0.21115167 -0.08064383 -0.01132488 0.037332974 0.08871327 0.0972882 -0.09270838 -0.0008950954 -0.27379093 0.17267276 0.19952214 0.02255501 0.18309216 -0.12630287 0.07139318 0.17480838 0.05036306 0.16628519 0.12490674 0.02571364 -0.12548324 0.045334835 0.052382912 -0.17804338 -0.11704747 0.34308958 -0.059135444 -0.14187443 -0.25298917 -0.09867575 -0.030797388 0.034271266 0.033939827 -0.0796236 0.062186338 0.08325272 -0.11313803 0.17034255 -0.027818222 0.1377922 -0.15758857 -0.04943884 0.13024627 0.09669394 0.26601818 0.006948458
in 0.012440086 0.11040719 -0.012849666 0.09290964 -0.07972144 -0.33584738 -0.12601718 -0.2595267 -0.33727908 -0.069633365 -0.0163871 -0.008740522 -0.017254686 -0.0050155944 -0.049694307 -0.010404902 -0.058033124 0.1280864 -0.09175434 0.13326514 0.10769238 0.17952108 0.28240898 0.4521449 -0.07934836 0.067488454 -0.0048905965 0.09992221 0.15219735 -0.113293305 -0.023758773 0.13264394 -0.11525996 -0.07501112 -0.21296655 0.003916659 -0.108518355 -0.12429437 -0.019828321 -0.24240857 -0.1522041 -0.14509949 0.03231436 0.08443054 -0.026866911 -0.10020908 0.097492106 -0.13915308 -0.098680526 -0.1081047 -0.0063557313 0.020586161 -0.20985532 -0.16047233 -0.21008705 0.034407597 0.3135808 -0.014134962 0.030835181 0.07095385 0.02675936 0.006355572 -0.014918085 0.010313049 0.1399904 0.0525679 -0.26099658 -0.01622818 0.12540202 0.034225248 -0.21008274 -0.0048396774 0.09516723 -0.09548964 0.18812294 0.046164535 -0.13768989 0.04219282 0.007528147 0.04331836 -0.050952625 0.07549312 0.23432745 -0.006662966 -0.0030097812 0.009488655 0.012482486 -0.0069720955 0.13791682 -0.048437305 0.04701792 0.16282237 -0.024947353 0.0050850054 0.3893829 0.059020057 -0.0029280712 -0.09437448 -0.13447046 0.11470259 -0.20852034 -0.11440712 0.0035576236 0.008134062 0.12327329 0.08950375 -0.008435103 0.054893654 0.27052206 0.07247013 0.05277092 -0.36001533 -0.10384121 -0.052010894 -0.21103063 -0.024597147 -0.443716 0.0740373 0.08608065 -0.13342541 0.06696039 0.09000352 0.046407126 0.018959345 -0.19714354 0.06871908 -0.11959734 0.30296963 0.114974216 0.029571522 -0.0881721 0.072081834 -0.010052683 0.0044189724 -0.10687096 0.07511523 0.14135349 -0.031247873 0.115160786 0.09375827 -0.040305912 0.06954596 0.10366585 0.04164465 -0.21187215 0.02753605 -0.038084142 -0.18221796 0.03952976 0.08004701 -0.036739275 -0.10140312 -0.1477339 0.03714309 0.44821632 0.034289815 0.118066356 -0.028453646 0.12973107 -0.32193786 -0.115426496 0.45779133 -0.15139781 -0.076242864 0.108048454 0.05240559 0.22419807 -0.1365141 
-0.21360777 0.14030857 -0.02776601 -0.020115474 -0.07066597 0.020898372 0.014558452 0.2506441 -0.05754328 -0.041735463 -0.12249524 -0.10760245 -0.114811294 -0.22330247 0.24388424 0.234468 0.04530107 0.069316186 -0.008741171 -0.12846868 0.16011412 0.05163249 -0.10586641 -0.1381455 -0.0063932473 -0.18183662 -0.026927002 -0.03217512 0.11633275 -0.10052802 0.21392068 -0.080930054 0.0438342 0.16460107 0.13401955 -0.10111292 0.15316881 0.031120954 -0.11848464 -0.047609225 0.5042394 0.17039074 0.29279602 -0.09230814 0.22846758 -0.029289516 -0.052461695 -0.006335409 -0.007340484 0.054269854 -0.09596643 -0.052444436 0.13412166 0.040756818 -0.16310513 0.3544867 0.014325076 0.29252282 0.08338105 0.07259305 0.13306075 -0.4998001 0.070836864 0.115676746 0.14786462 -0.15317042 -0.053047426 -0.118147776 -0.044508033 -0.10369096 -0.23159377 -0.059542064 0.049224064 -0.01353095 -0.31027538 0.027299901 0.17420498 0.12596332 -0.124590844 0.030096972 -0.12940277 0.04427203 0.07341154 0.07180236 0.067514785 -0.009053707 0.1675022 0.1581485
for -0.109976515 0.1686066 -0.34713957 -0.09134372 0.061696 -0.0019212356 -0.21205811 -0.15444273 -0.11584609 -0.1442363 -0.086657755 -0.040365797 0.083306134 -0.037432164 -0.08356189 0.023981024 0.0015174978 -0.0027167809 0.033005223 -0.12038482 -0.05258101 0.1834867 0.19345471 0.5756231 -0.25596714 0.12485763 0.07177012 0.010200464 -0.026214033 -0.09882409 -0.034797296 0.11594001 0.025715355 -0.0334732 -0.09303828 -0.028010204 -0.05105308 -0.32736382 0.004009488 -0.007075146 -0.11160078 -0.08461752 0.22652772 -0.049335904 -0.14651984 0.18009186 0.26992032 -0.11769892 -0.204322 -0.14354987 0.41121575 0.047323454 0.095121756 -0.061089292 0.18136063 -0.0046440926 0.24948792 -0.07826458 -0.03625476 0.09095407 -0.014378121 -0.070760965 -0.047818948 0.22109671 -0.15034196 0.24253963 -0.055900984 -0.09104821 -0.015753204 -0.11691716 -0.14280042 0.035703 0.14692362 -0.2709018 0.11283822 0.15952697 -0.15262556 -0.06532217 -0.13614954 0.093258746 0.23890424 0.19621842 0.13096257 -0.007378332 0.19430582 0.056934252 0.19916107 -0.125118 -0.018377285 0.22431874 0.06004092 0.14662232 0.108341254 0.13508536 0.16714472 -0.017511673 0.04328922 -0.24951583 0.034837235 0.2442482 0.23472343 -0.0042018527 -0.12109434 -0.012613255 0.011046199 -0.02523405 -0.2688494 -0.2086474 0.0048128082 0.13123842 0.2662243 -0.1918557 -0.0214424 0.07719561 -0.20078538 0.14483409 -0.21974912 0.079592474 -0.14877233 -0.18281712 0.01845022 0.24106319 0.07042614 -0.0627384 0.07755576 -0.02893163 -0.21092972 -0.0020360611 0.04802305 0.22096018 -0.15574396 -0.2057627 -0.26480713 -0.012858803 0.12132718 -0.08236549 -0.0014762381 0.042447194 -0.043262117 0.074755326 0.08228459 -0.031535044 0.18531218 -0.0785617 0.05814554 -0.109967075 0.2622751 0.120642886 0.121205255 0.22108686 -0.06361863 -0.18851177 -0.05038249 0.14845026 0.16023317 -0.008915233 -0.15640375 -0.01384065 0.17057164 -0.20944878 0.063283406 -0.10566992 -0.03836688 -0.10028407 0.11171135 0.13036416 0.13478366 -0.20928559 -0.017080816 
0.16376291 -0.3356237 -0.17640394 -0.07976006 -0.06104731 -0.05945607 0.29505745 0.1385816 -0.3698567 -0.22177996 -0.030307714 0.032171328 0.08498917 -0.13814233 0.1045589 0.08385398 0.13340573 0.07507179 0.16909795 0.26108184 0.056948207 -0.019535016 -0.0047397576 -0.20367156 0.029300513 -0.040819988 0.16727361 0.011016839 -0.041682925 0.022323234 -0.1929392 -0.19995897 0.086446226 0.15849206 -0.05596133 0.45048237 -0.0025659115 -0.053108044 -0.20979203 0.43950045 -0.06991201 0.32109445 -0.045528013 0.12520728 -0.18475004 -0.020132573 0.20525149 -0.19963227 -0.096442446 -0.1354165 0.27810615 0.091834925 0.058895927 -0.080994174 -0.037580542 0.013910849 0.10009435 -0.027952384 0.11222178 0.12904337 -0.24447812 -0.50730294 -0.0742851 0.017202623 -0.16630048 -0.036739457 -0.03772698 0.047964294 -0.39169553 -0.31868938 0.100314826 0.056332733 -0.0068960525 0.011943942 -0.013493331 -0.0641076 0.24290138 -0.16833131 0.23019645 -0.21336776 0.040684633 -0.12813863 -0.22319983 0.17009361 0.13051772 0.099689655 0.20140417

  第一行的两个数字分别表示word的数量和向量的维度;下面每一行由1个word加256个浮点数组成,共257个字段(这里只截取了一小部分)

构建向量矩阵
# Load the word2vec text file into three structures:
#   embeddings_index : word -> float32 coefficient vector
#   word_index       : word -> row number in embedword_matrix (numbering starts at 1)
#   embedword_matrix : dense matrix of vectors; row 0 is never written and stays zero
print('word embedding')
embeddings_index = {}
word_index = {}
# Running max/min over every coefficient read. The seeds 0 and 1 are kept
# unchanged so the resulting values match what earlier runs produced.
embedding_max_value = 0
embedding_min_value = 1
word_num = None      # vocabulary size announced by the header line
embedding_dim = 256  # vector width; replaced by the header value when present
i = 1
with open(config.WORD_EMBEDDING_DIR, 'r') as f:
    for line_no, line in enumerate(f):
        line = line.strip().split(' ')
        # The first line of a word2vec text file is "<word count> <dimension>".
        # Parse it explicitly instead of treating it as a malformed vector line.
        if line_no == 0 and len(line) == 2:
            word_num = int(line[0])
            embedding_dim = int(line[1])
            continue
        # A vector line is the word followed by exactly embedding_dim numbers.
        if len(line) != embedding_dim + 1:
            print("error!")
            continue
        coefs = np.asarray(line[1:], dtype='float32')
        line_max = np.max(coefs)
        line_min = np.min(coefs)
        if line_max > embedding_max_value:
            embedding_max_value = line_max
        if line_min < embedding_min_value:
            embedding_min_value = line_min
        embeddings_index[line[0]] = coefs
        word_index[line[0]] = i
        i = i + 1

# The matrix must be able to hold the highest row number handed out above,
# even when the header is missing or understates the vocabulary size.
highest_row = i - 1
if word_num is None or word_num < highest_row:
    word_num = highest_row

embedword_matrix = np.zeros((word_num + 1, embedding_dim))
# Every key of word_index was stored in embeddings_index in the same loop
# iteration, so this lookup cannot miss and no fallback vector is needed.
for word, row in word_index.items():
    embedword_matrix[row] = embeddings_index[word]

根据子空间对论文进行筛选,形成5个集合

训练数据集合构造思路
在每个子空间上
1.构造100个正样本对 100个负样本对
说明:每个样本对的输入由四部分组成:
①论文对中这两篇论文的句子的序列化表示(最终得到一个向量)
②规则一得到一个数字
③规则二得到一个数字
④规则三得到一个数字
综上:将上述四部分拼接成一个向量 (即每个样本对都使用一个向量来表示, 然后通过一个全连接层得到一个范围为(0,1)的数字,正样本对为1,负样本对为0)
关键代码:

# Subspace 0: for each paper, concatenate the last field of every record whose
# second-to-last field equals 0.
# NOTE(review): presumably each[-2] is the subspace label and each[-1] the
# serialized sentence -- confirm against the code that builds
# sentence_textcnn_list.
SubSpace0_dict={}
paper_id_list=[]
for each in sentence_textcnn_list:
    if each[-2]==0:
        paper_id=each[1]
        # Membership is tested on the dict (O(1)) rather than on
        # paper_id_list, which grows with duplicates; the two tests are
        # equivalent because an id is appended exactly when its key is set.
        if paper_id in SubSpace0_dict:
            SubSpace0_dict[paper_id]=SubSpace0_dict[paper_id]+each[-1]
        else:
            SubSpace0_dict[paper_id]=each[-1]

        # Still recorded once per matching record, as before.
        paper_id_list.append(paper_id)

# Save
# np.save stores a dict by wrapping it in a 0-d object array and pickling it.
np.save('data/SubSpace0_dict.npy', SubSpace0_dict)


# Load
# Object arrays are pickled, and np.load refuses them unless allow_pickle=True
# (the default became False in NumPy 1.16.3). Unpickling can execute arbitrary
# code, so only load files that this project wrote itself.
# .item() unwraps the 0-d object array back into the original dict.
SubSpace0_dict = np.load('data/SubSpace0_dict.npy', allow_pickle=True).item()


在五个子空间上创建训练集

  因为没有标记好的训练集,所以为了训练模型,打算自己制作包含label的200个左右的训练集样本,思路就是可以先依靠文本相似度进行筛选,然后在此基础上再进行处理

计算文本相似度的函数
class DocumentSimilar(object):
    """TF-IDF similarity index over a fixed collection of documents.

    The index is built once in the constructor; ``get_similar`` then scores
    a query document against every indexed document.
    """

    def __init__(self, documents):
        """Store ``documents`` and build the similarity index immediately.

        Args:
            documents: iterable of strings; each one is tokenized with
                ``split_word``.
        """
        self.documents = documents
        self.dictionary = None
        self.tfidf = None
        self.similar_matrix = None
        self.calculate_similar_matrix()

    @staticmethod
    def split_word(document):
        """Tokenize ``document`` by splitting on single space characters.

        No stop-word removal is performed here; the text is only split.
        """
        text=document.split(" ")
        return text

    def calculate_similar_matrix(self):
        """Build the dictionary, the TF-IDF model and the similarity index."""
        words = [self.split_word(document) for document in self.documents]
        self.dictionary = corpora.Dictionary(words)
        corpus = [self.dictionary.doc2bow(word) for word in words]
        self.tfidf = models.TfidfModel(corpus)
        corpus_tfidf = self.tfidf[corpus]
        # NOTE(review): the first argument is gensim's output prefix for
        # index shards -- confirm that an empty string is intended.
        self.similar_matrix = similarities.Similarity("",corpus_tfidf,len(self.dictionary))

    def get_similar(self, document):
        """Return the similarity of ``document`` to every indexed document.

        The result has one score per document passed to the constructor,
        in the same order.
        """
        words = self.split_word(document)
        corpus = self.dictionary.doc2bow(words)
        corpus_tfidf = self.tfidf[corpus]
        return self.similar_matrix[corpus_tfidf]
    


计算列表中第二大的数
def SecMax(list):
    """Return the second-largest distinct value in ``list``.

    Every occurrence of the maximum is ignored, so ``SecMax([1, 3, 2, 3])``
    is ``2``. The argument is not modified.

    Args:
        list: a non-empty list of mutually comparable values. The name
            shadows the builtin and is kept only so existing keyword
            callers keep working.

    Returns:
        The largest element that is not equal to the maximum.

    Raises:
        IndexError: if ``list`` is empty or all of its elements are equal,
            i.e. there is no second-largest value.
    """
    if not list:
        raise IndexError("SecMax() requires a non-empty list")
    largest = max(list)
    # Drop every copy of the maximum; what remains holds the runner-up.
    runners_up = [value for value in list if value != largest]
    if not runners_up:
        raise IndexError("SecMax() requires at least two distinct values")
    return max(runners_up)
    
筛选

这里只拿出子空间0的实例,其他空间进行相似处理:

# Subspace 0: for every paper, record the paper with the second-highest
# similarity score (candidate positive pair) and the paper with the lowest
# score (candidate negative pair).
# index_list0[k] is the paper id of documents0[k], so a position in a
# similarity result maps back to a paper id.
index_list0=list(SubSpace0_dict.keys())
documents0=list(SubSpace0_dict.values())
SubSpace0_train_pairs=[]    # entries: [paper_id, partner_id, second-highest score]
SubSpace0_train_pairs_=[]   # entries: [paper_id, partner_id, lowest score]
doc_similar0=DocumentSimilar(documents0)
for paper_id,document in SubSpace0_dict.items():
    scores=list(doc_similar0.get_similar(document))
    # SecMax may sort and trim its argument in place, so give it a copy and
    # keep `scores` in index order for the position lookups below.
    # NOTE(review): the top score is presumably the paper matched against
    # itself, which is why the second-largest is used -- confirm.
    Second_Num=SecMax(scores.copy())
    min_num=min(scores)
    Second_Maxnum_Index=scores.index(Second_Num)
    min_num_index=scores.index(min_num)
    SubSpace0_train_pairs.append([paper_id,index_list0[Second_Maxnum_Index],Second_Num])
    # Store the similarity score (not the list position) as the last field,
    # so negative pairs have the same layout as the positive pairs above.
    SubSpace0_train_pairs_.append([paper_id,index_list0[min_num_index],min_num])


在每个子空间挑出100个作为正样本对,100个作为负样本对

这里只拿出子空间0的实例,其他空间进行相似处理:

# Subspace 0
# 100 positive pairs: rank the candidate pairs by their last field and keep
# the ids of the top-ranked ones, highest first.
list0=[pair[-1] for pair in SubSpace0_train_pairs]
array0=np.array(list0)
# Reversing the ascending argsort and taking the first 100 gives the same
# indices as taking the last 100 and reversing them.
Max100_0=list(array0.argsort()[::-1][:100])
Max100_0_pos=[
    [int(SubSpace0_train_pairs[int(rank)][0]), int(SubSpace0_train_pairs[int(rank)][1])]
    for rank in Max100_0
]

上述内容详见:

https://blog.csdn.net/qq_43665502

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值