<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://primo.ai/index.php?action=history&amp;feed=atom&amp;title=State_Space_Model_%28SSM%29</id>
	<title>State Space Model (SSM) - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://primo.ai/index.php?action=history&amp;feed=atom&amp;title=State_Space_Model_%28SSM%29"/>
	<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;action=history"/>
	<updated>2026-06-14T17:00:06Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.31.0</generator>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=39210&amp;oldid=prev</id>
		<title>BPeat at 15:21, 28 May 2025</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=39210&amp;oldid=prev"/>
		<updated>2025-05-28T15:21:54Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:21, 28 May 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l2&quot; &gt;Line 2:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 2:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|title=PRIMO.ai&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|title=PRIMO.ai&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|titlemode=append&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|titlemode=append&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|keywords=ChatGPT, artificial, intelligence, machine, learning, &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;GPT-4, GPT-5, &lt;/del&gt;NLP, NLG, NLC, NLU, models, data, singularity, moonshot, Sentience, AGI, Emergence, Moonshot, Explainable, TensorFlow, Google, Nvidia, Microsoft, Azure, Amazon, AWS, Hugging Face, OpenAI, Tensorflow, OpenAI, Google, Nvidia, Microsoft, Azure, Amazon, AWS, Meta, LLM, metaverse, assistants, agents, digital twin, IoT, Transhumanism, Immersive Reality, Generative AI, Conversational AI, Perplexity, Bing, You, Bard, Ernie, prompt Engineering LangChain, Video/Image, Vision, End-to-End Speech, Synthesize Speech, Speech Recognition, Stanford, MIT |description=Helpful resources for your journey with artificial intelligence; videos, articles, techniques, courses, profiles, and tools&amp;#160; &amp;#160;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|keywords=ChatGPT, artificial, intelligence, machine, learning, &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; &lt;/ins&gt;NLP, NLG, NLC, NLU, models, data, singularity, moonshot, Sentience, AGI, Emergence, Moonshot, Explainable, TensorFlow, Google, Nvidia, Microsoft, Azure, Amazon, AWS, Hugging Face, OpenAI, Tensorflow, OpenAI, Google, Nvidia, Microsoft, Azure, Amazon, AWS, Meta, LLM, metaverse, assistants, agents, digital twin, IoT, Transhumanism, Immersive Reality, Generative AI, Conversational AI, Perplexity, Bing, You, Bard, Ernie, prompt Engineering LangChain, Video/Image, Vision, End-to-End Speech, Synthesize Speech, Speech Recognition, Stanford, MIT |description=Helpful resources for your journey with artificial intelligence; videos, articles, techniques, courses, profiles, and tools&amp;#160; &amp;#160;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;!-- Google tag (gtag.js) --&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;!-- Google tag (gtag.js) --&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l24&quot; &gt;Line 24:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 24:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Mixture-of-Experts (MoE)]] ... [[Mistral]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Mixture-of-Experts (MoE)]] ... [[Mistral]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;... [[GPT-4]] ... [[GPT-5]] &lt;/del&gt;... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; &lt;/ins&gt;... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Natural Language Processing (NLP)]] ... [[Natural Language Generation (NLG)|Generation (NLG)]] ... [[Natural Language Classification (NLC)|Classification (NLC)]] ... [[Natural Language Processing (NLP)#Natural Language Understanding (NLU)|Understanding (NLU)]] ... [[Language Translation|Translation]] ... [[Summarization]] ... [[Sentiment Analysis|Sentiment]] ... [[Natural Language Tools &amp;amp; Services|Tools]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Natural Language Processing (NLP)]] ... [[Natural Language Generation (NLG)|Generation (NLG)]] ... [[Natural Language Classification (NLC)|Classification (NLC)]] ... [[Natural Language Processing (NLP)#Natural Language Understanding (NLU)|Understanding (NLU)]] ... [[Language Translation|Translation]] ... [[Summarization]] ... [[Sentiment Analysis|Sentiment]] ... [[Natural Language Tools &amp;amp; Services|Tools]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://en.wikipedia.org/wiki/State-space_representation State-space representation | Wikipedia]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://en.wikipedia.org/wiki/State-space_representation State-space representation | Wikipedia]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l76&quot; &gt;Line 76:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 76:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs Transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs Transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] ... [[GPT-4]] ... [[GPT-5]] ... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37055&amp;oldid=prev</id>
		<title>BPeat: /* State Space vs Transformer Models */</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37055&amp;oldid=prev"/>
		<updated>2024-05-01T00:57:28Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;State Space vs Transformer Models&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:57, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l76&quot; &gt;Line 76:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 76:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs Transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs Transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] ... [[GPT-4]] ... [[GPT-5]] ... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37054&amp;oldid=prev</id>
		<title>BPeat: /* State Space vs transformer Models */</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37054&amp;oldid=prev"/>
		<updated>2024-05-01T00:56:17Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;State Space vs transformer Models&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:56, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l75&quot; &gt;Line 75:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 75:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;transformer &lt;/del&gt;Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;Transformer &lt;/ins&gt;Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37053&amp;oldid=prev</id>
		<title>BPeat: /* Selective State Space vs transformer Models */</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37053&amp;oldid=prev"/>
		<updated>2024-05-01T00:29:33Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Selective State Space vs transformer Models&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:29, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l75&quot; &gt;Line 75:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 75:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Selective &lt;/del&gt;State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &amp;lt;b&amp;gt;selective state space mechanism&amp;lt;/b&amp;gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &amp;lt;b&amp;gt;state space mechanism&amp;lt;/b&amp;gt; and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level [[architectures]]. And they&amp;#039;re not even dueling it out because they actually work best together - spoiler. But in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the [[Transformer]] sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &amp;lt;b&amp;gt;selective state space model&amp;lt;/b&amp;gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my anthropomorphizing policy but it has an ability to recognize when the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the [[Transformer]] is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &amp;lt;b&amp;gt;selective state space&amp;lt;/b&amp;gt; mechanism. ... to zero in and do the [[Gradient Descent Optimization &amp;amp; Challenges |gradient]] on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100% just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37052&amp;oldid=prev</id>
		<title>BPeat: /* Selective State Space vs transformer Models */</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37052&amp;oldid=prev"/>
		<updated>2024-05-01T00:28:56Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Selective State Space vs transformer Models&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:28, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l76&quot; &gt;Line 76:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 76:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the state space mechanism and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;doing &lt;/del&gt;it out because they actually work best together spoiler &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;but &lt;/del&gt;in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;. Range&lt;/del&gt;. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;noanthropomorphizing &lt;/del&gt;policy but it has an ability to recognize &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;When &lt;/del&gt;the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;State &lt;/del&gt;space. &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Mechanism&lt;/del&gt;. &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Oh, I see &lt;/del&gt;to &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;kind of &lt;/del&gt;zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;b&amp;gt;&lt;/ins&gt;selective state space mechanism&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/b&amp;gt;&lt;/ins&gt;. Is that it does have different strengths and weaknesses compared to the [[Attention]] mechanism. Both in terms of how much memory it consumes where [[Attention]] mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a 1,000 samples per second, if that were to be naively translated to a 1,000 tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with frontier grade [[Transformer|Transformers]]. It was only with [[GPT-4]] a year ago that the public first got to see a quality 8,000 token [[Transformer]]. And before that it was like just a couple months where we had just seen the 4,000 before that as of like 18 months ago, 2,000 tokens was what you could really get from like the [[OpenAI]] API. So just the sheer volume of data may not limit itself super well to the [[Transformer]] but also another other When they break down these micro tasks. And look at what the [[Transformer]] can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. [[Mamba]] versus [[Transformer]] comparison paper, it&amp;#039;s more about the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;b&amp;gt;&lt;/ins&gt;state space mechanism&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/b&amp;gt; &lt;/ins&gt;and the [[Attention]] mechanism. Those are really the two things that are more dueling it out than the higher level &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;architectures&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]]&lt;/ins&gt;. And they&amp;#039;re not even &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;dueling &lt;/ins&gt;it out because they actually work best together &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;- &lt;/ins&gt;spoiler&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;. But &lt;/ins&gt;in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Put it may be that the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Gradient Descent Optimization &amp;amp; Challenges |&lt;/ins&gt;gradient&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;b&amp;gt;&lt;/ins&gt;selective state space model&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/b&amp;gt;&lt;/ins&gt;, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;anthropomorphizing &lt;/ins&gt;policy but it has an ability to recognize &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;when &lt;/ins&gt;the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;b&amp;gt;&lt;/ins&gt;selective &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;state &lt;/ins&gt;space&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&amp;lt;/b&amp;gt; mechanism. .&lt;/ins&gt;.. to zero in and do the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Gradient Descent Optimization &amp;amp; Challenges |&lt;/ins&gt;gradient&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;% &lt;/ins&gt;just confident in that theory but it is consistent with all the evidence that I know of so far. [https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37051&amp;oldid=prev</id>
		<title>BPeat: /* Selective State Space vs transformer Models */</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37051&amp;oldid=prev"/>
		<updated>2024-05-01T00:20:18Z</updated>

		<summary type="html">&lt;p&gt;‎&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Selective State Space vs transformer Models&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:20, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l76&quot; &gt;Line 76:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 76:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;attention &lt;/del&gt;mechanism. Both in terms of how much memory it consumes where &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;attention &lt;/del&gt;mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;like &lt;/del&gt;a torrent of data and a &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;thousand &lt;/del&gt;samples per second, if that were to be naively translated to a &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;thousand &lt;/del&gt;tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Frontier &lt;/del&gt;grade Transformers. It was only with &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;gpt4 &lt;/del&gt;a year ago that the public first&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;. Got &lt;/del&gt;to see a quality 8, 000 token Transformer. And before that it was like just a couple months where we had just seen the &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;four thousand &lt;/del&gt;before that as of like 18 months ago, 2 000&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;, Calcons &lt;/del&gt;was what you could really get from like the &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;open AI &lt;/del&gt;API. So just the sheer volume of data may not limit itself super well to the Transformer but also another other When they break down these micro tasks. And look at what the Transformer can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. Mamba versus Transformer comparison paper, it&amp;#039;s more about the state space mechanism and the &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;attention &lt;/del&gt;mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even doing it out because they actually work best together spoiler but in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Range. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my noanthropomorphizing policy but it has an ability to recognize When the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective State space. Mechanism. Oh, I see to kind of zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Attention]] &lt;/ins&gt;mechanism. Both in terms of how much memory it consumes where &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Attention]] &lt;/ins&gt;mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about a torrent of data and a &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;1,000 &lt;/ins&gt;samples per second, if that were to be naively translated to a &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;1,000 &lt;/ins&gt;tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;frontier &lt;/ins&gt;grade &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Transformer|&lt;/ins&gt;Transformers&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]]&lt;/ins&gt;. It was only with &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[GPT-4]] &lt;/ins&gt;a year ago that the public first &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;got &lt;/ins&gt;to see a quality 8,000 token &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]]&lt;/ins&gt;. And before that it was like just a couple months where we had just seen the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;4,000 &lt;/ins&gt;before that as of like 18 months ago, 2&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;,&lt;/ins&gt;000 &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;tokens &lt;/ins&gt;was what you could really get from like the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[OpenAI]] &lt;/ins&gt;API. So just the sheer volume of data may not limit itself super well to the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;but also another other When they break down these micro tasks. And look at what the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Mamba&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;versus &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[&lt;/ins&gt;Transformer&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] &lt;/ins&gt;comparison paper, it&amp;#039;s more about the state space mechanism and the &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[[Attention]] &lt;/ins&gt;mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even doing it out because they actually work best together spoiler but in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Range. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my noanthropomorphizing policy but it has an ability to recognize When the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective State space. Mechanism. Oh, I see to kind of zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far. &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;[https://www.cognitiverevolution.ai/ Nick Labenz - The Cognitive Revolution]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37050&amp;oldid=prev</id>
		<title>BPeat at 00:07, 1 May 2024</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37050&amp;oldid=prev"/>
		<updated>2024-05-01T00:07:18Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:07, 1 May 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l77&quot; &gt;Line 77:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 77:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;= Selective State Space vs transformer Models =&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the attention mechanism. Both in terms of how much memory it consumes where attention mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about like a torrent of data and a thousand samples per second, if that were to be naively translated to a thousand tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with Frontier grade Transformers. It was only with gpt4 a year ago that the public first. Got to see a quality 8, 000 token Transformer. And before that it was like just a couple months where we had just seen the four thousand before that as of like 18 months ago, 2 000, Calcons was what you could really get from like the open AI API. So just the sheer volume of data may not limit itself super well to the Transformer but also another other When they break down these micro tasks. And look at what the Transformer can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. Mamba versus Transformer comparison paper, it&amp;#039;s more about the state space mechanism and the attention mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even doing it out because they actually work best together spoiler but in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Range. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my noanthropomorphizing policy but it has an ability to recognize When the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective State space. Mechanism. Oh, I see to kind of zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far.&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the attention mechanism. Both in terms of how much memory it consumes where attention mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about like a torrent of data and a thousand samples per second, if that were to be naively translated to a thousand tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with Frontier grade Transformers. It was only with gpt4 a year ago that the public first. Got to see a quality 8, 000 token Transformer. And before that it was like just a couple months where we had just seen the four thousand before that as of like 18 months ago, 2 000, Calcons was what you could really get from like the open AI API. So just the sheer volume of data may not limit itself super well to the Transformer but also another other When they break down these micro tasks. And look at what the Transformer can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. Mamba versus Transformer comparison paper, it&amp;#039;s more about the state space mechanism and the attention mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even doing it out because they actually work best together spoiler but in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Range. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my noanthropomorphizing policy but it has an ability to recognize When the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective State space. Mechanism. Oh, I see to kind of zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&amp;lt;youtube&amp;gt;dRxolamy-NA&amp;lt;/youtube&amp;gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37047&amp;oldid=prev</id>
		<title>BPeat at 21:44, 30 April 2024</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37047&amp;oldid=prev"/>
		<updated>2024-04-30T21:44:42Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:44, 30 April 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l73&quot; &gt;Line 73:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 73:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;GqwhkbrWDOI&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;GqwhkbrWDOI&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dG6MSsdojLg&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&amp;lt;youtube&amp;gt;dG6MSsdojLg&amp;lt;/youtube&amp;gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;= Selective State Space vs transformer Models =&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;We just did an episode on the first 90 days of [[Mamba]] literature. And one of the things that is really interesting about this new mechanism, the selective state space mechanism. Is that it does have different strengths and weaknesses compared to the attention mechanism. Both in terms of how much memory it consumes where attention mechanism is quadratic but the length of the input and that might be, by the way, one of the reasons like just as you talk about like a torrent of data and a thousand samples per second, if that were to be naively translated to a thousand tokens per second. Then very quickly you&amp;#039;re getting to a level of tokens that we have only very recently reached with Frontier grade Transformers. It was only with gpt4 a year ago that the public first. Got to see a quality 8, 000 token Transformer. And before that it was like just a couple months where we had just seen the four thousand before that as of like 18 months ago, 2 000, Calcons was what you could really get from like the open AI API. So just the sheer volume of data may not limit itself super well to the Transformer but also another other When they break down these micro tasks. And look at what the Transformer can do and can&amp;#039;t do one of the things that really struggles on is the hyper noisy environment there. There was a interesting result in this one. Mamba versus Transformer comparison paper, it&amp;#039;s more about the state space mechanism and the attention mechanism. Those are really the two things that are more dueling it out than the higher level architectures. And they&amp;#039;re not even doing it out because they actually work best together spoiler but in the super noisy environment where what actually matters is quite rare in what you&amp;#039;re signaling, then the Transformer sometimes has a hard time converging and the intuition I&amp;#039;ve developed for that is because it&amp;#039;s changing all the weights at the same time across like the entire range. Range. Put it may be that the gradient is often dominated by noise and has a hard time converging on the signal. Whereas When I don&amp;#039;t want to make everything about the selective state space model, so I do have an obsession about this as folks know. It is updating per token and so it seems like it has a more natural mechanism when the actual signal hits to say. Oh, and this is where I start to violate my noanthropomorphizing policy but it has an ability to recognize When the signal hits and update in a more focused way on that one thing that really was supposed to matter, whereas the Transformer is updating everything all across, it&amp;#039;s considering everything at once. And so, it seems like the signal can get lost in all that noise, the recurrent nature of the selective State space. Mechanism. Oh, I see to kind of zero in and do the gradient on the signal when you have the signal and then of course there&amp;#039;s still a lot of noise but that maybe can get separated from the signal because of this bit by bit level processing and updating. I&amp;#039;m not 100 just confident in that theory but it is consistent with all the evidence that I know of so far.&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37017&amp;oldid=prev</id>
		<title>BPeat at 00:58, 29 April 2024</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37017&amp;oldid=prev"/>
		<updated>2024-04-29T00:58:44Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:58, 29 April 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l21&quot; &gt;Line 21:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 21:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[State Space Model (SSM)]] ... [[Mamba]] ... [[Sequence to Sequence (Seq2Seq)]] ... [[Recurrent Neural Network (RNN)]] ... [[(Deep) Convolutional Neural Network (DCNN/CNN)|Convolutional Neural Network (CNN)]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[State Space Model (SSM)]] ... [[Mamba]] ... [[Sequence to Sequence (Seq2Seq)]] ... [[Recurrent Neural Network (RNN)]] ... [[(Deep) Convolutional Neural Network (DCNN/CNN)|Convolutional Neural Network (CNN)]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;−&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Memory]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Memory&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]] ... [[Memory Networks]] ... [[Hierarchical Temporal Memory (HTM)]] ... [[Lifelong Learning&lt;/ins&gt;]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Mixture-of-Experts (MoE)]] ... [[Mistral]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Mixture-of-Experts (MoE)]] ... [[Mistral]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
	<entry>
		<id>https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37007&amp;oldid=prev</id>
		<title>BPeat at 00:48, 29 April 2024</title>
		<link rel="alternate" type="text/html" href="https://primo.ai/index.php?title=State_Space_Model_(SSM)&amp;diff=37007&amp;oldid=prev"/>
		<updated>2024-04-29T00:48:27Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 00:48, 29 April 2024&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l22&quot; &gt;Line 22:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 22:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[State Space Model (SSM)]] ... [[Mamba]] ... [[Sequence to Sequence (Seq2Seq)]] ... [[Recurrent Neural Network (RNN)]] ... [[(Deep) Convolutional Neural Network (DCNN/CNN)|Convolutional Neural Network (CNN)]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[State Space Model (SSM)]] ... [[Mamba]] ... [[Sequence to Sequence (Seq2Seq)]] ... [[Recurrent Neural Network (RNN)]] ... [[(Deep) Convolutional Neural Network (DCNN/CNN)|Convolutional Neural Network (CNN)]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Memory]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Memory]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Mixture-of-Experts (MoE)]] ... [[Mistral]]&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2312.00752 Mamba: Linear-Time Sequence Modeling with Selective State Spaces | Albert Gu, Tri Dao]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] ... [[GPT-4]] ... [[GPT-5]] ... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&#039;diff-marker&#039;&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [[Large Language Model (LLM)]] ... [[Large Language Model (LLM)#Multimodal|Multimodal]] ... [[Foundation Models (FM)]] ... [[Generative Pre-trained Transformer (GPT)|Generative Pre-trained]] ... [[Transformer]] ... [[GPT-4]] ... [[GPT-5]] ... [[Attention]] ... [[Generative Adversarial Network (GAN)|GAN]] ... [[Bidirectional Encoder Representations from Transformers (BERT)|BERT]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>BPeat</name></author>
		
	</entry>
</feed>