[{"data":1,"prerenderedAt":383},["ShallowReactive",2],{"$fgukOamtKU1RtUiMFsqdObttmqPPQz0uc7bl_gj_LyX0":3,"$fgrvIlJQk0lkC3ckUSivXNGwH2JsEMvHrSi5mRMjcEDs":245,"article-10":382},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"popular":19,"latest":86,"banner":126,"list":151,"cache":244},[8,9,10],"Agent","OpenAI","LLM",[8,12,13,14,9,10,15,16,17,18],"Google","Nvidia","Claude","DeepSeek","OCR","Chat","Generator",[20,29,37,45,54,62,70,79],{"id":21,"publish_date":22,"is_original":23,"collection":5,"cover_url":24,"cover_url_1_1":25,"title":26,"summary":27,"author":28},411,"2023-09-10",1,"article_res/cover/451ef50c225a8dc61c4336506794d13b.jpeg","article_res/cover/3ba9dc7a72f87d40b20fc2d225289ee3.jpeg","Idealism","Reality is created by the mind, we can change our reality by changing our mind. - Plato","Renee's Entrepreneurial Journey",{"id":30,"publish_date":31,"is_original":23,"collection":32,"cover_url":33,"cover_url_1_1":34,"title":35,"summary":36,"author":28},108,"2024-12-07","#LLM #AGI #AI Agent","article_res/cover/0039044422e4ec9f61c18e8ee1693bb0.jpeg","article_res/cover/4220971b108a91d21407d87bb02fbaa6.jpeg","Freysa.ai: The World's First Adversarial AI Agent Game","说服 Freysa 把钱包里的钱都拿出来",{"id":38,"publish_date":39,"is_original":23,"collection":40,"cover_url":41,"cover_url_1_1":42,"title":43,"summary":44,"author":28},12,"2025-03-09","#Oxford #Reasoning #LLM #Tool Use","article_res/cover/d448e9b3617a0b5302e1bd10c438bca9.jpeg","article_res/cover/864a468f9cc4c9317efadb3811909888.jpeg","Agentic Reasoning Framework - Significantly enhance the reasoning ability of LLMs through the integration of external tools using agents","Agentic Reasoning: Reasoning LLMs with Tools for Deep Research",{"id":46,"publish_date":47,"is_original":4,"collection":48,"cover_url":49,"cover_url_1_1":50,"title":51,"summary":52,"author":53},480,"2023-04-14","#Stable 
Diffusion","article_res/cover/0bdbe7cb1de4a78e54536e5d9afa7ec9.jpeg","article_res/cover/b3d6ffec0608dcfaf18c5a69906d1490.jpeg","【AIGC Learning】Generate Prompts Using Word Graphs - Stable Diffusion Web UI Series 13","AI will become a powerful tool in education, transforming the way we learn and deliver instruction.  \n- Reid Hoffman","--",{"id":55,"publish_date":56,"is_original":4,"collection":57,"cover_url":58,"cover_url_1_1":59,"title":60,"summary":61,"author":28},413,"2023-09-08","#Neuroscience","article_res/cover/74f8302d78a23d9430f22171eae136b6.jpeg","article_res/cover/87ca08af81bb304746be5261160964c0.jpeg","Can machines be conscious?","Do we have an ethical obligation to not turn off conscious machines? Would turning them off be murder? No. I don't lose any sleep over unplugging a conscious machine.\n- Jeff Hawkins, \"A Thousand Brains\"",{"id":63,"publish_date":64,"is_original":23,"collection":65,"cover_url":66,"cover_url_1_1":67,"title":68,"summary":69,"author":28},178,"2024-09-09","#Entrepreneurship","article_res/cover/a7224f025b55d1820408085faef63079.jpeg","article_res/cover/11a9995b096cbf64465ef01b8673b154.jpeg","37signals company","This damn sense of relaxation",{"id":71,"publish_date":72,"is_original":4,"collection":73,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":78},460,"2023-05-12","#Google","article_res/cover/b970687b12faa52da976f91248c2aa7b.jpeg","article_res/cover/d1e71b52cfd2c63bc6e71f3e85ff135c.jpeg","Learn what BRC-20 and Ordinals are using Google Bard","Ordinals - a new protocol that allows users to store arbitrary data on the Bitcoin blockchain","Google Bard mainly writes",{"id":80,"publish_date":81,"is_original":23,"collection":5,"cover_url":82,"cover_url_1_1":83,"title":84,"summary":85,"author":28},309,"2024-03-26","article_res/cover/9877f95894ee88532d0e6012c23a2df3.jpeg","article_res/cover/20092164ddc109ce6ae56b1984246751.jpeg","Learning the Cancun Upgrade with lepton and perplexity","Building a quick 
conversation-based search demo with Lepton AI.",[87,95,103,111,119],{"id":88,"publish_date":89,"is_original":23,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":28},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to 
come",{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[127,135,143],{"id":128,"publish_date":129,"is_original":23,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":28},300,"2024-04-16","#AI in Science #AGI","article_res/cover/6bf01e793e0f33e848572412eebdf9b0.jpeg","article_res/cover/91a5ee21dafecb914fabeb9430d46ec1.jpeg","Would Einstein lose his job - AI and Quantum Computing: A Glimpse into the Near Future","So Einstein's job is still safe.",{"id":136,"publish_date":137,"is_original":23,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":28},101,"2024-12-14","#Nvidia #AI 3D Generator","article_res/cover/693e07c85980c5c0c8fde3f037733f23.jpeg","article_res/cover/9ea8edff2d5d303ff3fffff3f6f9c3d9.jpeg","NVIDIA's open-source 3D project LLaMA-Mesh","LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models",{"id":144,"publish_date":145,"is_original":23,"collection":146,"cover_url":147,"cover_url_1_1":148,"title":149,"summary":150,"author":28},131,"2024-11-10","#OpenAI","article_res/cover/87f8ed353ce39f31960e7cdfaf075a35.jpeg","article_res/cover/f597a63935f5cd32e484b4aadd6019e8.jpeg","ChatGPT has launched the Search function","Get fast, timely answers with links to relevant web 
sources.",{"big":152,"small":214},[153,181],{"title":154,"list":155},"AGENT",[156,157,165,173],{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":158,"publish_date":159,"is_original":23,"collection":160,"cover_url":161,"cover_url_1_1":162,"title":163,"summary":164,"author":28},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"id":174,"publish_date":175,"is_original":23,"collection":176,"cover_url":177,"cover_url_1_1":178,"title":179,"summary":180,"author":28},616,"2025-03-29","#MAS #AI Agent #AI Coder #MetaGPT #MGX","article_res/cover/9dcd702ad2035902e5e77967c34a1f1e.jpeg","article_res/cover/0a97fc4a922753c8f46ff38792020df8.jpeg","MGX - An automated website-building platform composed of multiple AI Agents","Your 24/7 AI Team | Dream, Chat, Create.",{"title":182,"list":183},"OPENAI",[184,191,199,206],{"id":185,"publish_date":167,"is_original":23,"collection":186,"cover_url":187,"cover_url_1_1":188,"title":189,"summary":190,"author":28},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native 
image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":192,"publish_date":193,"is_original":4,"collection":194,"cover_url":195,"cover_url_1_1":196,"title":197,"summary":198,"author":28},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  \n - OpenAI",{"id":200,"publish_date":201,"is_original":4,"collection":146,"cover_url":202,"cover_url_1_1":203,"title":204,"summary":205,"author":28},417,"2023-08-24","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"id":207,"publish_date":208,"is_original":4,"collection":209,"cover_url":210,"cover_url_1_1":211,"title":212,"summary":213,"author":28},407,"2023-09-22","#OpenAI #AI Image Generator","article_res/cover/c59005e903d35cfc32346e2756e2728a.jpeg","article_res/cover/ba011d265e6d84b5c8cb6fd6b757b6cc.jpeg","Dall-E 3","DALL·E 3 understands significantly more nuance and detail, allowing you to easily translate your ideas into 
images.",[215,221,241],{"title":10,"list":216},[217,218,219,220],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":223},"GOOGLE",[224,225,226,234],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},615,"2025-03-30","#AI Researcher #AI Science #HKU #Google #AI Agent","article_res/cover/21fadf906067714bb0db31ae13a77c15.jpeg","article_res/cover/2697999a72bd26b22e85f0e92936d3ed.jpeg","AI-Researcher: LLM-driven全自动 scientific research assistant","AI-Researcher: Fully-Automated Scientific Discovery with LLM Agents  \nOpen-Sourced Alternative to Google AI Co-Scientist",{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},463,"2023-05-09","article_res/cover/89800f207723acdb55fc53bf999ebdc9.jpeg","article_res/cover/5764f369b4accd8f83e94aa4c077a175.jpeg","The Smallville sandbox world - A town with 25 virtual residents","Believable proxies of human behavior can empower interactive apps: Immersive environment, Rehearsal space, Prototyping 
tool",{"title":242,"list":243},"NVIDIA",[],true,{"code":4,"msg":5,"data":246},{"id":247,"publish_date":248,"is_original":23,"collection":249,"articles_id":250,"cover_url":251,"cover_url_1_1":252,"title":253,"summary":254,"author":28,"content":255,"popular":256,"list":323,"category":380,"tag":381},10,"2025-03-11","#AI Voice Generator #AI Speech Generation #Sesame #TTS #CSM","9PB1bHK_Q6eN7DRl8n9fFg","article_res/cover/8022c1f35a13c9ddd348b4dcb307c428.jpeg","article_res/cover/757a7cd07dff517d86ce882b0654f953.jpeg","Sesame CSM - An incredibly powerful speech model, soon to be open-sourced~","A Conversational Speech Generation Model","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">I recently came across a very impressive voice model that is about to be open-sourced. 
I'd like to share its demo with you: \"Crossing the uncanny valley of conversational voice,\" which will be released on February 27, 2025, by Brendan Iribe, Ankit Kumar, and the Sesame team.\u003C/span>\u003Cspan leaf=\"\">https://github.com/SesameAILabs/csm\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">First, listen to a conversation between me and it (the one with a bit of an accent and not so pleasant voice is me):\u003C/span>\u003C/p>\u003Csection nodeleaf=\"\">\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423769654250.6369122776169238.mp4\" poster=\"./assets/17423769654150.8633829671928457.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: 
initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Achieving \"Voice Presence\"\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">How do we determine if someone truly understands us? This is not only reflected in language but also in the subtle nuances of voice: the rising intonation when excited, the appropriate pauses during deep thought, and the warm comfort conveyed.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Voice is the most intimate medium of human communication, conveying rich and delicate meanings through countless subtle variations in tone, pitch, rhythm, and emotion.\u003C/span>\u003C/p>\u003Cp 
style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">However, existing digital voice assistants lack this crucial quality, making it difficult for them to truly integrate into our daily lives. After the initial novelty wears off, the monotonous and neutral voice gradually becomes tiring.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The goal of the Sesame team is to achieve true \"Voice Presence\" — making voice interactions feel real, understanding, and valuable. 
They are creating not just a tool for processing commands, but a partner capable of engaging in genuine conversations, building trust through continuous interaction, and thus fully unlocking the enormous potential of voice as a human-computer interaction medium.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Key elements for achieving Voice Presence:\u003C/span>\u003C/span>\u003C/h3>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 
16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Emotional Intelligence\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Understanding and responding to emotions in the conversation;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Dialogue Dynamics\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Natural pacing, appropriate pauses, interruptions, and emphasis;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan 
leaf=\"\">Context Awareness\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Adjusting tone and style according to specific contexts;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Consistent Personality Traits\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Maintaining a coherent, reliable, and appropriate expression of character.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">CSM New Voice Model\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: 
left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The Sesame team has proposed\u003C/span>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Conversational Speech Model（CSM）\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">a new voice model using an end-to-end multimodal learning framework based on Transformers. 
The key innovations include:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">End-to-end voice interaction learning using a multimodal Transformer architecture;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Incorporating multiple dimensions such as language and prosody for voice inference;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Going beyond the limitations of traditional public evaluation datasets and adopting more rigorous assessment methods to continuously improve model 
performance.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Through CSM, the team has taken a significant step towards achieving \"Voice Presence,\" transforming AI voice from a monotonous command responder to a truly interactive, emotionally and contextually aware conversational partner.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Detailed Model Architecture\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 
0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Conversational Speech Model（CSM）\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">is a multimodal voice model implemented using two autoregressive Transformers based on the RQ-Transformer approach. Unlike previous methods, the Sesame team splits the Transformer at the zeroth codebook layer.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The first multimodal backbone network processes interleaved text (Text) and audio (Audio) sequences to predict the zeroth codebook. 
The second audio decoder then generates the audio information for layers 1 to N-1 based on the predicted zeroth codebook.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Specific Model Operation Mechanism\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">In practice, tokens of text (T) and audio (A) are input alternately into the Backbone network, which predicts the content of the zeroth codebook. 
Subsequently, the Decoder, based on the predicted zeroth codebook, autoregressively samples and generates the content of codebooks from layer 1 to N-1, ultimately reconstructing the audio.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The team used two Tokenizers:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: 
auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Text Tokenizer\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Using the Llama tokenizer[6] to generate text tokens;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Audio Tokenizer\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Using Mimi (a split-RVQ Tokenizer) to produce one semantic codebook and N-1 acoustic codebooks per frame at a frequency of 12.5Hz.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The structure of the training data samples is an alternating pattern of text and audio.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 
0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Compute Amortization Scheme\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Due to the high memory requirements of the model, even with a smaller model size, training speed can be significantly slowed down, affecting scalability and experimental speed. 
To address this, the team adopted \u003C/span>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">a compute amortization scheme\u003C/span>\u003C/strong>\u003Cspan leaf=\"\"> to alleviate memory bottlenecks:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The Backbone Transformer predicts the zeroth codebook over all audio frames;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The Decoder only predicts the remaining codebooks from layer 1 to N-1, but only for a randomly selected 1/16 of the frames, 
significantly reducing the memory consumption required for training and alleviating the bottleneck during model scaling.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Specifically:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The Backbone models the zeroth codebook across all frames (highlighted in blue).\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: 
pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The Decoder only predicts the remaining codebook content and calculates loss for a randomly sampled 1/16 of the frames (marked in green).\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">This method retains the integrity and fidelity of the RVQ codebooks while alleviating memory constraints, improving training speed and scalability, and facilitating faster experimental iterations.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan 
leaf=\"\">Model Scale and Training Details\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Three model sizes were designed:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 
0px;\">\u003Cspan leaf=\"\">Small\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Backbone 3B parameters, Decoder 250M parameters\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Medium\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Backbone 8B parameters, Decoder 300M parameters\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Tiny\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Backbone 1B parameters, Decoder 100M parameters\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: 
normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">All models were trained with a sequence length of 2048 (corresponding to approximately 2 minutes of audio content), and each model was trained for five epochs.\u003C/span>\u003C/p>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Sample Demonstrations\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: 
border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Paralinguistics\u003C/span>\u003C/strong>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cspan leaf=\"\">Sample Audio 1\u003C/span>\u003C/p>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_paralinguistics_01\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.09095827153804081.mp3\" isaac2=\"1\" low_size=\"13.66\" source_size=\"13.7\" high_size=\"29.67\" play_length=\"7000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDcy\" 
cover=\"./assets/17423769624110.09095827153804081.jpeg\" data-uuid=\"17423769624110.09095827153804081\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cspan leaf=\"\">Sample Audio 2\u003C/span>\u003C/p>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_paralinguistics_02\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.7984452525844936.mp3\" isaac2=\"1\" low_size=\"5.98\" source_size=\"6\" high_size=\"13.08\" play_length=\"3000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDcz\" cover=\"./assets/17423769624110.7984452525844936.jpeg\" data-uuid=\"17423769624110.7984452525844936\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: 
pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Foreign Language\u003C/span>\u003C/strong>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 1\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_foreign_words_01\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.8269218246544576.mp3\" isaac2=\"1\" low_size=\"17.69\" source_size=\"17.7\" high_size=\"38.11\" play_length=\"9000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDcx\" cover=\"./assets/17423769624110.8269218246544576.jpeg\" data-uuid=\"17423769624110.8269218246544576\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection 
style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 2\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_foreign_words_02\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.6779095765027339.mp3\" isaac2=\"1\" low_size=\"12.5\" source_size=\"12.5\" high_size=\"25.54\" play_length=\"6000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc0\" cover=\"./assets/17423769624110.6779095765027339.jpeg\" data-uuid=\"17423769624110.6779095765027339\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Contextual Expressivity\u003C/span>\u003C/strong>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 
25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 1\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_contextual_expressive_01\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.32842721518258644.mp3\" isaac2=\"1\" low_size=\"41.17\" source_size=\"41.2\" high_size=\"175.14\" play_length=\"22000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc1\" cover=\"./assets/17423769624110.32842721518258644.jpeg\" data-uuid=\"17423769624110.32842721518258644\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 2\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" 
name=\"sesame_contextual_expressive_02\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.735902580426834.mp3\" isaac2=\"1\" low_size=\"42.58\" source_size=\"42.6\" high_size=\"172.51\" play_length=\"22000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc2\" cover=\"./assets/17423769624110.735902580426834.jpeg\" data-uuid=\"17423769624110.735902580426834\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003C/ul>\u003Cblockquote style='box-sizing: border-box;margin: 20px 0px;cursor: pointer;padding: 10px 10px 10px 20px;border-style: none none none solid;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0.05);width: auto;height: auto;box-shadow: rgba(0, 0, 0, 0) 0px 0px 0px 0px;display: block;overflow: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;font-weight: normal;\">\u003Cspan leaf=\"\">Note: The model demonstrates how it precisely adjusts intonation, speech rate, and emotional expression based on context, making the voice content more realistic.\u003C/span>\u003C/p>\u003C/blockquote>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, 
\"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Pronunciation Correction Examples\u003C/span>\u003C/strong>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 1\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_pronunciation_correction_01\" author=\"Renee 
创业随笔\" src=\"./assets/17423769624110.5764455180147381.mp3\" isaac2=\"1\" low_size=\"12.89\" source_size=\"12.9\" high_size=\"52.32\" play_length=\"6000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc3\" cover=\"./assets/17423769624110.5764455180147381.jpeg\" data-uuid=\"17423769624110.5764455180147381\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio 2\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_pronunciation_correction_02\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.9785946409406849.mp3\" isaac2=\"1\" low_size=\"13.51\" source_size=\"13.5\" high_size=\"59.17\" play_length=\"7000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc4\" cover=\"./assets/17423769624110.9785946409406849.jpeg\" data-uuid=\"17423769624110.9785946409406849\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003C/ul>\u003Cblockquote style='box-sizing: border-box;margin: 20px 0px;cursor: pointer;padding: 10px 10px 10px 20px;border-style: none none none solid;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0.05);width: auto;height: auto;box-shadow: rgba(0, 0, 0, 0) 0px 0px 0px 0px;display: block;overflow: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 
2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;font-weight: normal;\">\u003Cspan leaf=\"\">Note: The pronunciation correction examples are real recordings, while the rest of the audio is generated by the model.\u003C/span>\u003C/p>\u003C/blockquote>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Multiple Speakers Example\u003C/span>\u003C/strong>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 
2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">Sample Audio\u003C/span>\u003C/section>\u003Csection nodeleaf=\"\">\u003Cmp-common-mpaudio class=\"js_editor_audio res_iframe js_uneditable custom_select_card\" data-pluginname=\"insertaudio\" name=\"sesame_conversation_01\" author=\"Renee 创业随笔\" src=\"./assets/17423769624110.38870096699451984.mp3\" isaac2=\"1\" low_size=\"27.16\" source_size=\"27.2\" high_size=\"54.04\" play_length=\"13000\" data-trans_state=\"1\" data-verify_state=\"3\" voice_encode_fileid=\"MzkwOTMzMzk0MV8yMjQ3NDk0NDc5\" cover=\"./assets/17423769624110.38870096699451984.jpeg\" data-uuid=\"17423769624110.38870096699451984\">\u003C/mp-common-mpaudio>\u003C/section>\u003C/li>\u003C/ul>\u003Cblockquote style='box-sizing: border-box;margin: 20px 0px;cursor: pointer;padding: 10px 10px 10px 20px;border-style: none none none solid;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0.05);width: auto;height: auto;box-shadow: rgba(0, 0, 0, 0) 0px 0px 0px 0px;display: block;overflow: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 
0px;white-space: normal;text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;font-weight: normal;\">\u003Cspan leaf=\"\">Note: Based on audio prompts from two speakers, the model generates a natural and smooth multi-speaker dialogue in a single pass.\u003C/span>\u003C/p>\u003C/blockquote>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">Model Evaluation\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan 
leaf=\"\">To comprehensively evaluate the performance of the CSM model, both objective and subjective evaluation methods were employed. Objective evaluations used traditional metrics (such as Word Error Rate) and novel semantic and pronunciation tests; subjective evaluations utilized the Expresso dataset, where listeners provided Comparative Mean Opinion Scores (CMOS) to measure the model's performance in emotional expression and contextual appropriateness.\u003C/span>\u003C/p>\u003Ch4 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 18px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">📌 Objective Evaluation\u003C/span>\u003C/span>\u003C/h4>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Traditional evaluation 
metrics such as Word Error Rate (WER) and Speaker Similarity (SIM) have reached saturation, with modern models like CSM approaching human-level performance on these metrics.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">To further demonstrate the model's performance, the following two more challenging tests were introduced:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 
1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Homograph Disambiguation\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003Cspan leaf=\"\">This test checks whether the model can correctly pronounce homographs, words that are spelled the same but pronounced differently in different contexts, such as the English word \"lead\" which can be pronounced /lɛd/ (metal lead) or /liːd/ (to lead).\u003C/span>\u003C/p>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Pronunciation Consistency Test\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003Cspan leaf=\"\">This test evaluates the model's stability in pronouncing the same word in different contexts, such as the words \"route,\" \"data,\" and \"caramel,\" which have different common pronunciation variants in 
English.\u003C/span>\u003C/p>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The following figure shows the performance comparison of various models in the above tests:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The left side shows the results of the Homograph test;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 
1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cspan leaf=\"\">The right side shows the results of the Consistency test.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">(Comparison with the default configurations of Play.ht, ElevenLabs, and OpenAI models.)\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">Overall, larger models tend to have higher pronunciation accuracy, which aligns with the team's hypothesis that larger models can synthesize more realistic speech.\u003C/span>\u003C/p>\u003Ch4 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, 
serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 18px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">📌 Subjective Evaluation (Expresso Dataset)\u003C/span>\u003C/span>\u003C/h4>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">For subjective evaluation, the Expresso dataset was chosen, which includes a wide variety of emotional and prosodic variations, making it ideal for assessing the naturalness and contextual appropriateness of speech. Listeners rated the model-generated audio against real human recordings on a 7-point scale (CMOS). 
Multiple listeners were invited, each evaluating an average of 15 samples.\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">No Context Condition\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Listeners judged which audio sounded more like a real human voice without any specific context;\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box 
border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">With Context Condition\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">: Listeners judged which audio was more suitable for the given context.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The results are as follows:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong 
style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">In the no-context scenario\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">, the naturalness of the model-generated speech is already close to that of real human speech, with little difference between the models.\u003C/span>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">In the context-based scenario\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">, listeners tended to prefer real human recordings. 
This indicates that there is still a gap between the models and real humans in terms of prosody and contextual matching in conversational speech generation.\u003C/span>\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='box-sizing: border-box;margin: 30px 0px 15px;color: rgba(0, 0, 0, 0.85);font-weight: 500;cursor: pointer;padding: 0px;display: block;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan style=\"box-sizing: border-box;cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;text-align: left;font-weight: bold;display: block;\">\u003Cspan leaf=\"\">📌 Model Limitations and Future Plans\u003C/span>\u003C/span>\u003C/h3>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">The current CSM model is primarily trained on English data, and although it has some cross-lingual capabilities due to the presence of a small amount of other language data in the dataset, it has not yet reached the desired level. 
Additionally, the model currently only achieves high-quality generation of text and speech content and cannot effectively model deeper dialogue structures, such as turn-taking, pacing, and pausing in multi-speaker conversations.\u003C/span>\u003C/p>\u003Cp style='box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;text-indent: 0px;padding: 8px 0px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;orphans: 2;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;'>\u003Cspan leaf=\"\">To further enhance the capabilities of CSM, the team plans to:\u003C/span>\u003C/p>\u003Cul style='box-sizing: border-box;margin: 8px 0px;cursor: pointer;list-style-type: disc;padding: 0px 0px 0px 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;font-style: normal;font-variant-ligatures: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: normal;orphans: 2;text-align: left;text-indent: 0px;text-transform: none;widows: 2;word-spacing: 0px;-webkit-text-stroke-width: 0px;white-space: normal;background-color: rgb(255, 255, 255);text-decoration-thickness: initial;text-decoration-style: initial;text-decoration-color: initial;' class=\"list-paddingleft-1\">\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 
1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Expand Multilingual Support\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003Cspan leaf=\"\">Improve the model's performance in multilingual scenarios by training on more diverse datasets to enhance its cross-lingual capabilities.\u003C/span>\u003C/p>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Integrate Deep Text and Speech Interaction\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003Cspan leaf=\"\">In the future, the CSM model will be further expanded to capture the full structure of multi-speaker conversations, including turn-taking and dialogue pacing.\u003C/span>\u003C/p>\u003C/section>\u003C/li>\u003Cli style=\"box-sizing: border-box;cursor: pointer;\">\u003Csection style=\"box-sizing: border-box;cursor: 
pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;font-weight: normal;\">\u003Cp style=\"box-sizing: border-box;margin: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: 0em;text-align: left;text-indent: 0em;padding: 8px 0px;\">\u003Cstrong style=\"box-sizing: border-box;font-weight: bold;cursor: pointer;color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;margin: 0px;padding: 0px;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003Cspan leaf=\"\">Combine the Advantages of Speech and Text\u003C/span>\u003C/strong>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003Cspan leaf=\"\">The team is exploring a new AI architecture that allows the model to deeply understand both text and speech information, further narrowing the gap between generated and real human conversations.\u003C/span>\u003C/p>\u003C/section>\u003C/li>\u003C/ul>\u003Csection>\u003Cspan leaf=\"\">\u003Cbr>\u003C/span>\u003C/section>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",[257,266,275,283,291,299,307,316],{"id":258,"title_md5":259,"publish_date":260,"author_md5":261,"is_original":23,"collection":262,"summary_md5":263,"cover_url":264,"cover_url_1_1":265},246,"f64b1c03cdcc0522b4a051af2822f9b5","2024-06-14","bc27fa490c4d0d525bac812fc0793534","#AI Game","8b50e643a66c0ba17ef2a95608f33f5f","article_res/cover/bb761d6a72b8800406032d2cfadc1eca.jpeg","article_res/cover/d9e1cb5455d969e3e1392af062a8d9c1.jpeg",{"id":267,"title_md5":268,"publish_date":269,"author_md5":270,"is_original":4,"collection":271,"summary_md5":272,"cover_url":273,"cover_url_1_1":274},167,"95cbd9ae5b6852d31fc2bb9230b718a6","2024-09-27","fb0edf26dea7e6e8b89a99bc9d3a3170","#History of Intelligence 
#Neuroscience","354f3ce13d05d28757e3a6f5de37a11c","article_res/cover/f02a6c714df40e27d49925fca276adde.jpeg","article_res/cover/fbd1293bad410b5df2c4b93c4aaee342.jpeg",{"id":276,"title_md5":277,"publish_date":278,"author_md5":261,"is_original":23,"collection":279,"summary_md5":280,"cover_url":281,"cover_url_1_1":282},50,"3406d80eb37f81951f4d94b9bdd39b01","2025-02-03","#OpenAI #LLM #AI Agent #AI Research #RL","497897b17a9ba507e8b4d37cf7e9ed16","article_res/cover/01d12853914b4549d0556b990ccbfb1b.jpeg","article_res/cover/7260f24c0367b431ad9245c97db8d4c6.jpeg",{"id":284,"title_md5":285,"publish_date":286,"author_md5":261,"is_original":23,"collection":287,"summary_md5":288,"cover_url":289,"cover_url_1_1":290},213,"57108a1562be4ac13caa0a6ba6d8182a","2024-07-22","#AI Avatar","d4316691cd09b681dca3192ed44d6d03","article_res/cover/2d2bca4edde06eec3d7208e1d73355ca.jpeg","article_res/cover/453434f4f2d1f4ef2fe91456155aba6c.jpeg",{"id":292,"title_md5":293,"publish_date":294,"author_md5":261,"is_original":23,"collection":295,"summary_md5":296,"cover_url":297,"cover_url_1_1":298},464,"0eaa87401709ebf317f13fab4f9096fb","2023-05-08","#LLM","ee1676ce5461bcd9dd0243bc9bb89c74","article_res/cover/e40e44c9ae820f61529e54673f114a97.jpeg","article_res/cover/38defbf768866b72f2d890ff3096d515.jpeg",{"id":300,"title_md5":301,"publish_date":302,"author_md5":261,"is_original":23,"collection":303,"summary_md5":304,"cover_url":305,"cover_url_1_1":306},122,"c66515089a71579f1a76107c65c16141","2024-11-22","#Prompt Engineering #Anthropic 
#Claude","c7c67b90f398f0f276e632c785ec1790","article_res/cover/b8ac5e872ecbdf416ab833583583357c.jpeg","article_res/cover/06f47cf55286a8a82aeea325077fdcbf.jpeg",{"id":308,"title_md5":309,"publish_date":310,"author_md5":311,"is_original":4,"collection":312,"summary_md5":313,"cover_url":314,"cover_url_1_1":315},488,"dee757a03ca9116c7196733bf2a33184","2023-04-06","7d83126d7919a9a379875f1ec38011de","#Psychology","5e79c1ccd849e8e8991ee62e8e9b71e9","article_res/cover/619fd1fb8865eba85732953da8b42712.jpeg","article_res/cover/490f118eaa3025fb9d0d9d42e06108e5.jpeg",{"id":317,"title_md5":318,"publish_date":319,"author_md5":261,"is_original":23,"collection":5,"summary_md5":320,"cover_url":321,"cover_url_1_1":322},435,"30f718a4e1d538989d8d1ed9af4d2e9f","2023-07-10","6b294c39a9df9494790472d6664a0981","article_res/cover/d7732186f0b72b21dd4b197148f47445.jpeg","article_res/cover/25e51a5a9b2738c39b68c6c832e8a8f9.jpeg",{"related":324,"small":365},[325,333,341,349,357],{"id":326,"publish_date":327,"is_original":23,"collection":328,"cover_url":329,"cover_url_1_1":330,"title":331,"summary":332,"author":28},132,"2024-11-09","#AI Code Generator","article_res/cover/f331019b22da4c731e5cbb950f2151cf.jpeg","article_res/cover/e0236f165f8989e216a34876d6afead7.jpeg","Bolt.new: A brand-new AI-driven development tool","What do you want to build? Prompt, run, edit, and deploy full-stack web apps.",{"id":334,"publish_date":335,"is_original":4,"collection":5,"cover_url":336,"cover_url_1_1":337,"title":338,"summary":339,"author":340},519,"2022-06-07","article_res/cover/a6e71ca7dc0b962f78e7c169ca6caf79.jpeg","article_res/cover/3c066e7f23b38b0f8e77d9b5be4911cd.jpeg","The relationship between art and science","For an Impressionist to paint from nature is not to paint the subject, but to realize sensations. 
\n- Paul Cezanne","Notes",{"id":342,"publish_date":343,"is_original":4,"collection":271,"cover_url":344,"cover_url_1_1":345,"title":346,"summary":347,"author":348},155,"2024-10-10","article_res/cover/10e1e9838c22fb47444422912cf8e145.jpeg","article_res/cover/dcfca8f9c278e98010611cbd5e1d69df.jpeg","【A Brief History of Intelligence】6. Speaking Language (Human)","What makes us humans unique is that we possess a \"rational soul\" — the ability to reason, think abstractly, and reflect.  \n-Aristotle","Notes on \"A Brief History of Intelligence\"",{"id":350,"publish_date":351,"is_original":23,"collection":352,"cover_url":353,"cover_url_1_1":354,"title":355,"summary":356,"author":28},468,"2023-05-04","#Stable Diffusion #AI Image Generator","article_res/cover/0dea50e91dff99427ad4f0d58b535a4b.jpeg","article_res/cover/0674573a06914817c27b45386ba37292.jpeg","DeepFloyd IF uses","Inspired by Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding",{"id":358,"publish_date":359,"is_original":23,"collection":360,"cover_url":361,"cover_url_1_1":362,"title":363,"summary":364,"author":28},28,"2025-02-24","#Andrej Karpathy #Deep Dive into LLMs #LLM #RL #DeepSeek","article_res/cover/db62a8c43fa565112c0aefa8776e1de2.jpeg","article_res/cover/cbb4289951cb6a0cda64f8fd913a2e23.jpeg","DeepSeek-R1 - Andrej Karpathy in-depth explanation of LLM (Part 9)","DeepSeek 
R1",[366,372,378],{"title":10,"list":367},[368,369,370,371],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":373},[374,375,376,377],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},{"title":242,"list":379},[],[8,9,10],[8,12,13,14,9,10,15,16,17,18],["Reactive",245],1754646408333]