<?xml version="1.0"?>
<?xml-stylesheet type="text/css" href="http://www.cslt.org/mediawiki/skins/common/feed.css?303"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
		<id>http://www.cslt.org/mediawiki/index.php?action=history&amp;feed=atom&amp;title=ASR%3A2015-12-28</id>
	<title>ASR:2015-12-28 - Revision history</title>
		<link rel="self" type="application/atom+xml" href="http://www.cslt.org/mediawiki/index.php?action=history&amp;feed=atom&amp;title=ASR%3A2015-12-28"/>
		<link rel="alternate" type="text/html" href="http://www.cslt.org/mediawiki/index.php?title=ASR:2015-12-28&amp;action=history"/>
		<updated>2026-04-03T23:08:19Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
		<generator>MediaWiki 1.23.3</generator>

	<entry>
		<id>http://www.cslt.org/mediawiki/index.php?title=ASR:2015-12-28&amp;diff=18573&amp;oldid=prev</id>
	<title>Zxw: /* Speech Processing */</title>
		<link rel="alternate" type="text/html" href="http://www.cslt.org/mediawiki/index.php?title=ASR:2015-12-28&amp;diff=18573&amp;oldid=prev"/>
				<updated>2016-01-09T01:30:16Z</updated>
		
		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Speech Processing&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class='diff diff-contentalign-left'&gt;
				&lt;col class='diff-marker' /&gt;
				&lt;col class='diff-content' /&gt;
				&lt;col class='diff-marker' /&gt;
				&lt;col class='diff-content' /&gt;
				&lt;tr style='vertical-align: top;'&gt;
				&lt;td colspan='2' style=&quot;background-color: white; color:black; text-align: center;&quot;&gt;←Older revision&lt;/td&gt;
				&lt;td colspan='2' style=&quot;background-color: white; color:black; text-align: center;&quot;&gt;Revision as of 01:30, 9 January 2016&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 10:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 10:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;:* CTC/nnet3/Kaldi&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;:* CTC/nnet3/Kaldi&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;:* more reference results of Kaldi/CTC on 1400h Chinese plus 100h English&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;:* more reference results of Kaldi/CTC on 1400h Chinese plus 100h English&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class='diff-marker'&gt;+&lt;/td&gt;&lt;td style=&quot;color:black; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;:* Further tested nnet3-ctc training on the 4000h 8k &amp;amp; 10000h 16k datasets. Tuned decoding configurations (phone-lm-weight, acwt, blank-scale), but CTC is still about 5 percent worse than standard nnet3&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class='diff-marker'&gt;+&lt;/td&gt;&lt;td style=&quot;color:black; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;:* launched MPE-after-CTC experiments on WSJ; code revised&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class='diff-marker'&gt;+&lt;/td&gt;&lt;td style=&quot;color:black; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;:* experiments with the &amp;quot;chain&amp;quot; model on WSJ&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;====Adaptive learning rate method====&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f9f9f9; color: #333333; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #e6e6e6; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;====Adaptive learning rate method====&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Zxw</name></author>	</entry>

	<entry>
		<id>http://www.cslt.org/mediawiki/index.php?title=ASR:2015-12-28&amp;diff=18572&amp;oldid=prev</id>
		<title>Zxw: Created page with "==Speech Processing ==  === AM development ===  ==== Environment ====  ==== End-to-End ==== *monophone ASR --Zhiyuan :* MPE :* CTC/nnet3/Kaldi :* more reference result..."</title>
		<link rel="alternate" type="text/html" href="http://www.cslt.org/mediawiki/index.php?title=ASR:2015-12-28&amp;diff=18572&amp;oldid=prev"/>
				<updated>2016-01-09T01:26:42Z</updated>
		
		<summary type="html">&lt;p&gt;以“==Speech Processing ==  === AM development ===  ==== Environment ====  ==== End-to-End ==== *monophone ASR --Zhiyuan :* MPE :* CTC/nnet3/Kaldi :* more refered ruselt...”为内容创建页面&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;==Speech Processing ==&lt;br /&gt;
&lt;br /&gt;
=== AM development ===&lt;br /&gt;
&lt;br /&gt;
==== Environment ====&lt;br /&gt;
&lt;br /&gt;
==== End-to-End ====&lt;br /&gt;
*monophone ASR --Zhiyuan&lt;br /&gt;
:* MPE&lt;br /&gt;
:* CTC/nnet3/Kaldi&lt;br /&gt;
:* more reference results of Kaldi/CTC on 1400h Chinese plus 100h English (a CTC loss sketch follows below)&lt;br /&gt;
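A minimal numpy sketch of the CTC forward (alpha) recursion that defines the loss used in these experiments; this is an editor's illustration, not the Kaldi/nnet3 implementation, and it assumes per-frame log-probabilities and a non-empty label sequence:&lt;br /&gt;
&lt;pre&gt;
import numpy as np

def ctc_loss(log_probs, labels, blank=0):
    # log_probs: (T, V) per-frame log-probabilities; labels: non-empty id sequence.
    # Blank-extended label sequence: blank, l1, blank, l2, ..., blank.
    ext = [blank]
    for l in labels:
        ext += [l, blank]
    T, S = log_probs.shape[0], len(ext)
    alpha = np.full((T, S), -np.inf)        # log alpha(t, s)
    alpha[0, 0] = log_probs[0, ext[0]]
    alpha[0, 1] = log_probs[0, ext[1]]
    for t in range(1, T):
        for s in range(S):
            cands = [alpha[t - 1, s]]                       # stay on the same state
            if s &amp;gt;= 1:
                cands.append(alpha[t - 1, s - 1])           # advance one state
            if s &amp;gt;= 2 and ext[s] != blank and ext[s] != ext[s - 2]:
                cands.append(alpha[t - 1, s - 2])           # skip over a blank
            alpha[t, s] = np.logaddexp.reduce(cands) + log_probs[t, ext[s]]
    # A valid alignment must end in the last label or the trailing blank.
    return -np.logaddexp(alpha[-1, -1], alpha[-1, -2])
&lt;/pre&gt;&lt;br /&gt;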
&lt;br /&gt;
====Adaptive learning rate method====&lt;br /&gt;
* sequence training -Xiangyu&lt;br /&gt;
:* write a technical report &lt;br /&gt;
&lt;br /&gt;
==== Mic-Array ====&lt;br /&gt;
* hold &lt;br /&gt;
* compute EER with Kaldi (see the EER sketch below)&lt;br /&gt;
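A small sketch of computing EER from target and non-target trial scores (editor's illustration; the score arrays would come from a Kaldi scoring run, and the brute-force threshold sweep is only for clarity):&lt;br /&gt;
&lt;pre&gt;
import numpy as np

def compute_eer(target_scores, nontarget_scores):
    # Sweep every observed score as a threshold; the EER is where the
    # false-acceptance rate (FAR) crosses the false-rejection rate (FRR).
    thresholds = np.sort(np.concatenate([target_scores, nontarget_scores]))
    far = np.array([(nontarget_scores &amp;gt;= t).mean() for t in thresholds])
    frr = np.array([(target_scores &amp;lt; t).mean() for t in thresholds])
    i = int(np.argmin(np.abs(far - frr)))
    return (far[i] + frr[i]) / 2.0
&lt;/pre&gt;&lt;br /&gt;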
&lt;br /&gt;
====Data selection unsupervised learning====&lt;br /&gt;
* hold&lt;br /&gt;
* acoustic-feature-based submodular selection using the Pingan dataset --zhiyong (greedy selection sketch below)&lt;br /&gt;
* write code to speed up --zhiyong&lt;br /&gt;
* curriculum learning --zhiyong&lt;br /&gt;
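A greedy facility-location sketch of acoustic-feature-based submodular selection (editor's illustration; utterance-level feature vectors are an assumed input, and this is not the group's actual code):&lt;br /&gt;
&lt;pre&gt;
import numpy as np

def greedy_facility_location(feats, k):
    # feats: (N, D) utterance-level acoustic features (e.g., mean MFCCs).
    # Greedily pick k utterances maximizing sum_i max_{j in S} sim(i, j).
    f = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    sim = f @ f.T                  # cosine similarity matrix
    best = np.zeros(len(f))        # max similarity of each point to the selected set
    chosen = []
    for _ in range(k):
        gains = np.maximum(sim - best[None, :], 0.0).sum(axis=1)
        for j in chosen:           # never re-pick a selected utterance
            gains[j] = -1.0
        c = int(np.argmax(gains))
        chosen.append(c)
        best = np.maximum(best, sim[c])
    return chosen
&lt;/pre&gt;&lt;br /&gt;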
&lt;br /&gt;
====RNN-DAE (RNN-based Deep Auto-Encoder)====&lt;br /&gt;
* hold&lt;br /&gt;
* RNN-DAE performs worse than DNN-DAE because the training dataset is small &lt;br /&gt;
* extract real room impulse responses to generate reverberated WSJ data, then train RNN-DAE (reverberation sketch below) &lt;br /&gt;
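A minimal sketch of generating reverberated training data by convolving clean speech with a measured room impulse response (editor's illustration; the waveform arrays are assumed to be loaded already at a common sample rate):&lt;br /&gt;
&lt;pre&gt;
import numpy as np
from scipy.signal import fftconvolve

def reverberate(clean, rir):
    # Convolve the clean waveform with the room impulse response, trim to
    # the original length, and renormalize to the clean signal's peak level.
    wet = fftconvolve(clean, rir)[:len(clean)]
    return wet * (np.abs(clean).max() / (np.abs(wet).max() + 1e-8))
&lt;/pre&gt;&lt;br /&gt;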
&lt;br /&gt;
===Speaker recognition=== &lt;br /&gt;
* DNN-ivector framework&lt;br /&gt;
* SUSR &lt;br /&gt;
* AutoEncoder + metric learning&lt;br /&gt;
* binary ivector&lt;br /&gt;
* Deep speaker embedding tasks&lt;br /&gt;
&lt;br /&gt;
===language vector===&lt;br /&gt;
* write a paper--zhiyuan&lt;br /&gt;
:*hold&lt;br /&gt;
* language vector is added to multiple hidden layers --zhiyuan &lt;br /&gt;
:* write code done&lt;br /&gt;
:* check code &lt;br /&gt;
:*http://192.168.0.51:5555/cgi-bin/cvss/cvss_request.pl?account=zxw&amp;amp;step=view_request&amp;amp;cvssid=480&lt;br /&gt;
* RNN language vector&lt;br /&gt;
:*hold&lt;br /&gt;
* language vector into multiple layers --Zhiyuan &lt;br /&gt;
:* a Chinese paper&lt;br /&gt;
* speech rate into multiple layers --Zhiyuan&lt;br /&gt;
:* verify the code for extra input(s) into the DNN (a forward-pass sketch follows below)&lt;br /&gt;
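A sketch of appending a fixed language vector as an extra input to every hidden layer of a DNN (editor's illustration with hypothetical weight shapes, not the group's Kaldi code):&lt;br /&gt;
&lt;pre&gt;
import numpy as np

def forward(x, layers, lang_vec):
    # layers: list of (W, b) pairs; each W expects its layer input
    # concatenated with lang_vec, i.e. W has (h_dim + lang_dim) rows.
    h = x
    for W, b in layers[:-1]:
        h = np.maximum(np.concatenate([h, lang_vec]) @ W + b, 0.0)   # ReLU
    W, b = layers[-1]
    return np.concatenate([h, lang_vec]) @ W + b                     # logits
&lt;/pre&gt;&lt;br /&gt;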
&lt;br /&gt;
===multi-GPU===&lt;br /&gt;
* multi-stream training --Sheng Su&lt;br /&gt;
:* write a technical report &lt;br /&gt;
* kaldi-nnet3 --Xuewei&lt;br /&gt;
:* 7×2048 8k 1400h TDNN cross-entropy (Xent) training done&lt;br /&gt;
:* nnet3 mpe code is under investigation&lt;br /&gt;
:*http://192.168.0.51:5555/cgi-bin/cvss/cvss_request.pl?account=zxw&amp;amp;step=view_request&amp;amp;cvssid=472&lt;br /&gt;
:* Analysed the MPE divergence problem when the context is 10; the cause may be over-fitting, which a larger dataset could weaken.&lt;br /&gt;
*RNN AM training on big dataset --mengyuan&lt;br /&gt;
:* fix decode bug&lt;br /&gt;
:* nnet3 LSTM &amp;amp; BLSTM training on the Sinovoice 120h dataset using Kaldi's default config, but results are worse than TDNN&lt;br /&gt;
:* Tested Xuewei's nnet3-MPE code on the Sinovoice 120h 16k dataset; no performance improvement observed. There are still some bugs in the code.&lt;br /&gt;
:* Ran nnet3-ctc training on the Sinovoice 120h 16k dataset. Results look OK, but worse than the standard model.&lt;br /&gt;
:* Started nnet3-ctc training on the Sinovoice 4000h 8k dataset.&lt;br /&gt;
* train MPE --Zhiyong, Xuewei&lt;br /&gt;
:* train nnet3 MPE using data from Jietong --Xuewei&lt;br /&gt;
:* modify code to print stats --Xuewei&lt;br /&gt;
:* MPE does not work when the context is 10; needs further investigation --zhiyong&lt;br /&gt;
:* The nnet1 MPE we tested is also based on context 5; a larger context may be an inherent problem --zhiyong&lt;br /&gt;
:* modify code to reduce memory usage&lt;br /&gt;
&lt;br /&gt;
===multi-task===&lt;br /&gt;
* test self-information-based neural structure learning --mengyuan&lt;br /&gt;
:* hold&lt;br /&gt;
:* write code done&lt;br /&gt;
:* no significant performance improvement observed &lt;br /&gt;
* speech rate learning --xiangyu&lt;br /&gt;
:* hold&lt;br /&gt;
:* no significant performance improvement observed&lt;br /&gt;
:*http://192.168.0.51:5555/cgi-bin/cvss/cvss_request.pl?account=zxw&amp;amp;step=view_request&amp;amp;cvssid=483&lt;br /&gt;
:* get results with an extra input of speech-rate info --Zhiyuan&lt;br /&gt;
&lt;br /&gt;
===30 Chinese dataset===&lt;br /&gt;
* revise syllable text &lt;br /&gt;
* add some words to the lexicon, applied to both training and graph building. &lt;br /&gt;
* train and decode the 30 Chinese data again&lt;br /&gt;
* revise the technical report&lt;br /&gt;
*prepare data&lt;br /&gt;
*kaldi recipe&lt;br /&gt;
&lt;br /&gt;
==Text Processing==&lt;br /&gt;
===Work===&lt;br /&gt;
====RNN Poem Process====&lt;br /&gt;
* Combine additional rhymes.&lt;br /&gt;
* Investigate new methods.&lt;br /&gt;
====Document Representation====&lt;br /&gt;
* Code done; waiting for experiment results.&lt;br /&gt;
====Seq to Seq====&lt;br /&gt;
* Work on some tasks.&lt;br /&gt;
====Order representation ====&lt;br /&gt;
* Coding some ideas.&lt;br /&gt;
====Balance Representation====&lt;br /&gt;
* Investigate some papers.&lt;br /&gt;
* Current solution: use knowledge or similar pairs from a large corpus.&lt;br /&gt;
&lt;br /&gt;
===Hold===&lt;br /&gt;
====Neural Based Document Classification====&lt;br /&gt;
====RNN Rank Task====&lt;br /&gt;
====Graph RNN====&lt;br /&gt;
:* Entity path embedded into entity.&lt;br /&gt;
*(hold)&lt;br /&gt;
====RNN Word Segment====&lt;br /&gt;
:* Set bounds for word segmentation. &lt;br /&gt;
* (hold)&lt;br /&gt;
====Recommendation====&lt;br /&gt;
* Reproduce baseline.&lt;br /&gt;
:* LDA matrix decomposition.&lt;br /&gt;
:* LDA (Text classification &amp;amp; Recommendation System) --&amp;gt; AAAI (doc-topic feature sketch below)&lt;br /&gt;
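A short sketch of extracting LDA doc-topic mixtures as features for classification or recommendation (editor's illustration with a hypothetical toy corpus; scikit-learn stands in for whatever LDA implementation the baseline uses):&lt;br /&gt;
&lt;pre&gt;
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["example review text one", "example review text two"]   # hypothetical corpus
counts = CountVectorizer(max_features=5000).fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0)  # more topics on real data
doc_topics = lda.fit_transform(counts)   # (n_docs, n_components) topic mixtures
# Each row is a document's topic mixture, usable as downstream features.
&lt;/pre&gt;&lt;br /&gt;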
====RNN based QA====&lt;br /&gt;
*Read Source Code.&lt;br /&gt;
*Attention-based QA.&lt;br /&gt;
*Coding.&lt;br /&gt;
&lt;br /&gt;
===Text Group Intern Project===&lt;br /&gt;
====Buddhist Process====&lt;br /&gt;
:*(hold)&lt;br /&gt;
&lt;br /&gt;
====RNN Poem Process====&lt;br /&gt;
*Done by Haichao Yu &amp;amp; Chaoyuan Zuo; mentor: Tianyi Luo.&lt;br /&gt;
&lt;br /&gt;
====RNN Document Vector====&lt;br /&gt;
:*(hold)&lt;br /&gt;
&lt;br /&gt;
====Image Baseline====&lt;br /&gt;
:*Demo Release.&lt;br /&gt;
:*Paper Report.&lt;br /&gt;
*Read CNN Paper.&lt;br /&gt;
&lt;br /&gt;
===Text Intuitive Idea===&lt;br /&gt;
====Trace Learning====&lt;br /&gt;
* (Hold)&lt;br /&gt;
====Match RNN ====&lt;br /&gt;
* (Hold)&lt;br /&gt;
&lt;br /&gt;
=financial group=&lt;br /&gt;
==model research==&lt;br /&gt;
* RNN&lt;br /&gt;
:* online model, updated daily&lt;br /&gt;
:* modify cost function and learning method&lt;br /&gt;
:* add more features&lt;br /&gt;
==rule combination==&lt;br /&gt;
* GA method to optimize the model (a minimal GA sketch follows below)&lt;br /&gt;
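A minimal genetic-algorithm sketch for optimizing rule weights against a supplied fitness function (editor's illustration; the fitness function, e.g. a backtest score, is an assumed input):&lt;br /&gt;
&lt;pre&gt;
import numpy as np

rng = np.random.default_rng(0)

def ga_optimize(fitness, dim, pop=40, gens=100, sigma=0.1):
    # Keep the best half each generation; breed children by averaging two
    # random elites (crossover) and adding Gaussian noise (mutation).
    P = rng.normal(size=(pop, dim))
    for _ in range(gens):
        scores = np.array([fitness(w) for w in P])
        elite = P[np.argsort(scores)[pop // 2:]]      # top half by fitness
        pairs = elite[rng.integers(len(elite), size=(pop - len(elite), 2))]
        children = pairs.mean(axis=1) + rng.normal(scale=sigma, size=(pop - len(elite), dim))
        P = np.vstack([elite, children])
    return P[int(np.argmax([fitness(w) for w in P]))]
&lt;/pre&gt;&lt;br /&gt;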
&lt;br /&gt;
==basic rule==&lt;br /&gt;
* classical tenth model&lt;br /&gt;
==multiple-factor==&lt;br /&gt;
* add more factors&lt;br /&gt;
* use a sparse model &lt;br /&gt;
==display==&lt;br /&gt;
* bug fixed&lt;br /&gt;
:* buy rule fixed&lt;br /&gt;
==data==&lt;br /&gt;
* data api&lt;br /&gt;
:* download the futures data and factor data&lt;/div&gt;</summary>
		<author><name>Zxw</name></author>	</entry>

	</feed>