[{"data":1,"prerenderedAt":375},["ShallowReactive",2],{"$fgukOamtKU1RtUiMFsqdObttmqPPQz0uc7bl_gj_LyX0":3,"$f3iOIjuoLrKFdteDjX4mi1Yxxt5eDQBiCfuk1KolaEGo":245,"article-219":374},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"popular":19,"latest":86,"banner":126,"list":151,"cache":244},[8,9,10],"Agent","OpenAI","LLM",[8,12,13,14,9,10,15,16,17,18],"Google","Nvidia","Claude","DeepSeek","OCR","Chat","Generator",[20,29,37,45,54,62,70,79],{"id":21,"publish_date":22,"is_original":23,"collection":5,"cover_url":24,"cover_url_1_1":25,"title":26,"summary":27,"author":28},411,"2023-09-10",1,"article_res/cover/451ef50c225a8dc61c4336506794d13b.jpeg","article_res/cover/3ba9dc7a72f87d40b20fc2d225289ee3.jpeg","Idealism","Reality is created by the mind, we can change our reality by changing our mind. - Plato","Renee's Entrepreneurial Journey",{"id":30,"publish_date":31,"is_original":23,"collection":32,"cover_url":33,"cover_url_1_1":34,"title":35,"summary":36,"author":28},108,"2024-12-07","#LLM #AGI #AI Agent","article_res/cover/0039044422e4ec9f61c18e8ee1693bb0.jpeg","article_res/cover/4220971b108a91d21407d87bb02fbaa6.jpeg","Freysa.ai: The World's First Adversarial AI Agent Game","说服 Freysa 把钱包里的钱都拿出来",{"id":38,"publish_date":39,"is_original":23,"collection":40,"cover_url":41,"cover_url_1_1":42,"title":43,"summary":44,"author":28},12,"2025-03-09","#Oxford #Reasoning #LLM #Tool Use","article_res/cover/d448e9b3617a0b5302e1bd10c438bca9.jpeg","article_res/cover/864a468f9cc4c9317efadb3811909888.jpeg","Agentic Reasoning Framework - Significantly enhance the reasoning ability of LLMs through the integration of external tools using agents","Agentic Reasoning: Reasoning LLMs with Tools for Deep Research",{"id":46,"publish_date":47,"is_original":4,"collection":48,"cover_url":49,"cover_url_1_1":50,"title":51,"summary":52,"author":53},480,"2023-04-14","#Stable Diffusion","article_res/cover/0bdbe7cb1de4a78e54536e5d9afa7ec9.jpeg","article_res/cover/b3d6ffec0608dcfaf18c5a69906d1490.jpeg","【AIGC Learning】Generate Prompts Using Word Graphs - Stable Diffusion Web UI Series 13","AI will become a powerful tool in education, transforming the way we learn and deliver instruction.  \n- Reid Hoffman","--",{"id":55,"publish_date":56,"is_original":4,"collection":57,"cover_url":58,"cover_url_1_1":59,"title":60,"summary":61,"author":28},413,"2023-09-08","#Neuroscience","article_res/cover/74f8302d78a23d9430f22171eae136b6.jpeg","article_res/cover/87ca08af81bb304746be5261160964c0.jpeg","Can machines be conscious?","Do we have an ethical obligation to not turn off conscious machines? Would turning them off be murder? No. I don't lose any sleep over unplugging a conscious machine.\n- Jeff Hawkins, \"A Thousand Brains\"",{"id":63,"publish_date":64,"is_original":23,"collection":65,"cover_url":66,"cover_url_1_1":67,"title":68,"summary":69,"author":28},178,"2024-09-09","#Entrepreneurship","article_res/cover/a7224f025b55d1820408085faef63079.jpeg","article_res/cover/11a9995b096cbf64465ef01b8673b154.jpeg","37signals company","This damn sense of relaxation",{"id":71,"publish_date":72,"is_original":4,"collection":73,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":78},460,"2023-05-12","#Google","article_res/cover/b970687b12faa52da976f91248c2aa7b.jpeg","article_res/cover/d1e71b52cfd2c63bc6e71f3e85ff135c.jpeg","Learn what BRC-20 and Ordinals are using Google Bard","Ordinals - a new protocol that allows users to store arbitrary data on the Bitcoin blockchain","Google Bard mainly writes",{"id":80,"publish_date":81,"is_original":23,"collection":5,"cover_url":82,"cover_url_1_1":83,"title":84,"summary":85,"author":28},309,"2024-03-26","article_res/cover/9877f95894ee88532d0e6012c23a2df3.jpeg","article_res/cover/20092164ddc109ce6ae56b1984246751.jpeg","Learning the Cancun Upgrade with lepton and perplexity","Building a quick conversation-based search demo with Lepton AI.",[87,95,103,111,119],{"id":88,"publish_date":89,"is_original":23,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":28},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[127,135,143],{"id":128,"publish_date":129,"is_original":23,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":28},300,"2024-04-16","#AI in Science #AGI","article_res/cover/6bf01e793e0f33e848572412eebdf9b0.jpeg","article_res/cover/91a5ee21dafecb914fabeb9430d46ec1.jpeg","Would Einstein lose his job - AI and Quantum Computing: A Glimpse into the Near Future","So Einstein's job is still safe.",{"id":136,"publish_date":137,"is_original":23,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":28},101,"2024-12-14","#Nvidia #AI 3D Generator","article_res/cover/693e07c85980c5c0c8fde3f037733f23.jpeg","article_res/cover/9ea8edff2d5d303ff3fffff3f6f9c3d9.jpeg","NVIDIA's open-source 3D project LLaMA-Mesh","LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models",{"id":144,"publish_date":145,"is_original":23,"collection":146,"cover_url":147,"cover_url_1_1":148,"title":149,"summary":150,"author":28},131,"2024-11-10","#OpenAI","article_res/cover/87f8ed353ce39f31960e7cdfaf075a35.jpeg","article_res/cover/f597a63935f5cd32e484b4aadd6019e8.jpeg","ChatGPT has launched the Search function","Get fast, timely answers with links to relevant web sources.",{"big":152,"small":214},[153,181],{"title":154,"list":155},"AGENT",[156,157,165,173],{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":158,"publish_date":159,"is_original":23,"collection":160,"cover_url":161,"cover_url_1_1":162,"title":163,"summary":164,"author":28},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"id":174,"publish_date":175,"is_original":23,"collection":176,"cover_url":177,"cover_url_1_1":178,"title":179,"summary":180,"author":28},616,"2025-03-29","#MAS #AI Agent #AI Coder #MetaGPT #MGX","article_res/cover/9dcd702ad2035902e5e77967c34a1f1e.jpeg","article_res/cover/0a97fc4a922753c8f46ff38792020df8.jpeg","MGX - An automated website-building platform composed of multiple AI Agents","Your 24/7 AI Team | Dream, Chat, Create.",{"title":182,"list":183},"OPENAI",[184,191,199,206],{"id":185,"publish_date":167,"is_original":23,"collection":186,"cover_url":187,"cover_url_1_1":188,"title":189,"summary":190,"author":28},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":192,"publish_date":193,"is_original":4,"collection":194,"cover_url":195,"cover_url_1_1":196,"title":197,"summary":198,"author":28},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  \n - OpenAI",{"id":200,"publish_date":201,"is_original":4,"collection":146,"cover_url":202,"cover_url_1_1":203,"title":204,"summary":205,"author":28},417,"2023-08-24","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"id":207,"publish_date":208,"is_original":4,"collection":209,"cover_url":210,"cover_url_1_1":211,"title":212,"summary":213,"author":28},407,"2023-09-22","#OpenAI #AI Image Generator","article_res/cover/c59005e903d35cfc32346e2756e2728a.jpeg","article_res/cover/ba011d265e6d84b5c8cb6fd6b757b6cc.jpeg","Dall-E 3","DALL·E 3 understands significantly more nuance and detail, allowing you to easily translate your ideas into images.",[215,221,241],{"title":10,"list":216},[217,218,219,220],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":223},"GOOGLE",[224,225,226,234],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},615,"2025-03-30","#AI Researcher #AI Science #HKU #Google #AI Agent","article_res/cover/21fadf906067714bb0db31ae13a77c15.jpeg","article_res/cover/2697999a72bd26b22e85f0e92936d3ed.jpeg","AI-Researcher: LLM-driven全自动 scientific research assistant","AI-Researcher: Fully-Automated Scientific Discovery with LLM Agents  \nOpen-Sourced Alternative to Google AI Co-Scientist",{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},463,"2023-05-09","article_res/cover/89800f207723acdb55fc53bf999ebdc9.jpeg","article_res/cover/5764f369b4accd8f83e94aa4c077a175.jpeg","The Smallville sandbox world - A town with 25 virtual residents","Believable proxies of human behavior can empower interactive apps: Immersive environment, Rehearsal space, Prototyping tool",{"title":242,"list":243},"NVIDIA",[],true,{"code":4,"msg":5,"data":246},{"id":247,"publish_date":248,"is_original":23,"collection":249,"articles_id":250,"cover_url":251,"cover_url_1_1":252,"title":253,"summary":254,"author":28,"content":255,"popular":256,"list":321,"category":372,"tag":373},219,"2024-07-15","#Google #AI Audio Generator #AI Sound Generator","U4F8N_gExG6-XJs189IuKw","article_res/cover/9e71dbefd6443d046a301e634dd877f1.jpeg","article_res/cover/05643a41d0389cf6b560c03899a3258e.jpeg","Google DeepMind's Video-to-audio research - dubbing for videos","Generating audio for video","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>V2A combines video pixels with natural language text prompts to generate rich soundscapes for on-screen actions. V2A technology can be paired with video generation models like Veo to create scenes with dramatic scores, realistic sound effects, or dialogues matching the video characters and tone; it can also generate soundtracks for various traditional materials, including archival footage and silent films, opening up broader creative opportunities.\u003C/p>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Prompt for audio: \u003Ccode style=\"font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;cursor: pointer;color: rgb(30, 107, 184);line-height: 1.8em;letter-spacing: 0em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;height: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">Cinematic, thriller, horror film, music, tension, ambience, footsteps on concrete\u003C/code>\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803326130.8936747697373413.mp4\" poster=\"./assets/17423803326170.5071312057652162.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Csection>\u003Cspan style='background-color: rgb(255, 255, 255);color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;font-weight: bold;'>This research differs from existing video-to-audio solutions because it is capable of understanding raw pixels, and adding text prompts is optional.\u003C/span>\u003Cspan style='background-color: rgb(255, 255, 255);color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;font-weight: bold;'>Additionally, this system does not require manual adjustments to align generated sounds with the video, which typically involves tedious adjustments of sound, visual, and temporal elements.\u003C/span>\u003C/section>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803326140.5266382606173969.mp4\" poster=\"./assets/17423803325920.22418150545542503.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;text-wrap: wrap;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">Enhanced creative control\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Importantly, V2A can generate an infinite number of soundtracks for any video input. Users can choose to define \"positive prompts\" to guide the generation of desired sounds, or \"negative prompts\" to avoid undesired sounds. This flexibility gives users more control over V2A's audio output, allowing them to quickly experiment with different audio outputs and select the best match.\u003C/p>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Prompt for audio: \u003Ccode style=\"font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;cursor: pointer;color: rgb(30, 107, 184);line-height: 1.8em;letter-spacing: 0em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;height: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">A spaceship hurtles through the vastness of space, stars streaking past it, high speed, Sci-fi\u003C/code>\u003Cspan style='color: rgba(0, 0, 0, 0.9);font-family: mp-quote, -apple-system-font, BlinkMacSystemFont, \"Helvetica Neue\", \"PingFang SC\", \"Hiragino Sans GB\", \"Microsoft YaHei UI\", \"Microsoft YaHei\", Arial, sans-serif;font-size: var(--articleFontsize);letter-spacing: 0.034em;text-align: justify;'>\u003C/span>\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803329190.05777634559411471.mp4\" poster=\"./assets/17423803325960.5586004105490692.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Prompt for audio: \u003Ccode style=\"letter-spacing: 0em;font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;cursor: pointer;color: rgb(30, 107, 184);line-height: 1.8em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;height: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">Ethereal cello atmosphere\u003C/code>\u003Cspan style='color: rgba(0, 0, 0, 0.9);font-family: mp-quote, -apple-system-font, BlinkMacSystemFont, \"Helvetica Neue\", \"PingFang SC\", \"Hiragino Sans GB\", \"Microsoft YaHei UI\", \"Microsoft YaHei\", Arial, sans-serif;font-size: var(--articleFontsize);letter-spacing: 0.034em;text-align: justify;'>\u003C/span>\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803348920.23301773291060224.mp4\" poster=\"./assets/17423803333100.4585527487860592.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Prompt for audio: \u003Ccode style=\"letter-spacing: 0em;font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;cursor: pointer;color: rgb(30, 107, 184);line-height: 1.8em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;height: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">A spaceship hurtles through the vastness of space, stars streaking past it, high speed, Sci-fi\u003C/code>\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803358340.12488663267431255.mp4\" poster=\"./assets/17423803332280.22819085738996137.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"font-size: 20px;font-weight: bold;letter-spacing: 0em;\">\u003C/span>\u003C/p>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"font-size: 20px;font-weight: bold;letter-spacing: 0em;\">How it works\u003C/span>\u003C/p>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Experiments were conducted on autoregressive and diffusion methods to find the most scalable AI architecture. It was found that diffusion-based methods provided the most realistic and engaging results in audio generation, capable of synchronizing video and audio information. The V2A system first encodes the video input into a compressed representation, then the diffusion model iteratively refines audio from random noise. This process is guided by visual input and natural language prompts, generating synchronized and realistic audio closely aligned with the prompts. Finally, the audio output is decoded into audio waveforms and combined with the video data.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100005852\" data-ratio=\"0.562962962962963\" data-s=\"300,640\" data-type=\"png\" data-w=\"1080\" style=\"\" src=\"./assets/17423803359950.1623174016461837.png\">\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;text-wrap: wrap;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cspan style=\"font-size: 16px;\">The V2A system generates audio waveforms synchronized with the video through video pixel and audio prompt inputs.\u003C/span>\u003Cspan style=\"font-size: 16px;\">First, V2A encodes the video and audio prompt inputs and processes them iteratively through a diffusion model.\u003C/span>\u003Cspan style=\"font-size: 16px;\">Then it generates compressed audio, which is decoded into audio waveforms.\u003C/span>\u003Cspan style=\"font-size: 16px;\">To generate higher-quality audio and increase the ability to guide the model to produce specific sounds, more information was added during training, including AI-generated annotations with detailed sound descriptions and dialogue records.\u003C/span>\u003Cspan style=\"font-size: 16px;\">By training on videos, audio, and additional annotations, our technology learns to associate specific audio events with various visual scenes while responding to the information provided in the annotations or dialogue records.\u003C/span>\u003Cbr>\u003C/span>\u003C/h3>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;text-wrap: wrap;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">Remaining challenges\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Since the quality of the audio output depends on the quality of the video input, artifacts or distortions in the video (parts outside the model's training distribution) may significantly degrade the audio quality. Lip synchronization in speaking videos is still being improved. V2A attempts to generate speech based on input dialogue records and synchronize it with the lip movements of the characters. However, the paired video generation model may not have been adjusted according to the dialogue record, leading to mismatches and often uncomfortable lip-sync issues due to the mismatch between the mouth movements generated by the video model and the dialogue record.\u003C/p>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;text-wrap: wrap;background-color: rgb(255, 255, 255);'>Prompt for audio: \u003Ccode style=\"font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;cursor: pointer;color: rgb(30, 107, 184);line-height: 1.8em;letter-spacing: 0em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;height: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">Music, Transcript: “this turkey looks amazing, I’m so hungry”\u003C/code>\u003C/p>\u003Csection>\u003Cdiv style=\"height: 508px; background: rgb(0, 0, 0); border-radius: 4px; overflow: hidden; margin-bottom: 12px;\">\u003Cvideo src=\"./assets/17423803344940.14785819259701682.mp4\" poster=\"./assets/17423803326460.5722455482395401.jpeg\" controls=\"\" style=\"width: 100%;height: 100%;\">\u003C/video>\u003C/div>\u003C/section>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",[257,265,273,281,290,298,306,314],{"id":258,"title_md5":259,"publish_date":260,"author_md5":261,"is_original":23,"collection":5,"summary_md5":262,"cover_url":263,"cover_url_1_1":264},340,"5a360e5607ef5f6d3d0f805a3616a92f","2024-01-21","bc27fa490c4d0d525bac812fc0793534","97fea3d871bcad52c292a9b4b6309486","article_res/cover/5360ee5c76f60679ef0da213560eb0d1.jpeg","article_res/cover/268e6d62eef7bb4f20167ff77149a536.jpeg",{"id":266,"title_md5":267,"publish_date":268,"author_md5":261,"is_original":4,"collection":269,"summary_md5":270,"cover_url":271,"cover_url_1_1":272},389,"9b6c3c47bc4e7d991838999163c18798","2023-10-23","#Psychology #LLM","9a9f6d9fb26191d6c19cdc857d7d981f","article_res/cover/6e93f4660361e315031231d4495b6909.jpeg","article_res/cover/f0137b4d702612e32422491076d40496.jpeg",{"id":274,"title_md5":275,"publish_date":276,"author_md5":261,"is_original":23,"collection":277,"summary_md5":278,"cover_url":279,"cover_url_1_1":280},65,"355241d059bad6536d0bb9bea822f270","2025-01-20","#AI Image Generator #AI Video Generator #Krea #AI 3D Generator","29bfdafcda8f22bb9177d697aa2e5446","article_res/cover/aa9dbc8db793695f3f24fcf41afc1fee.jpeg","article_res/cover/512cc57558a4f8a8eff6a2acbdb816a3.jpeg",{"id":282,"title_md5":283,"publish_date":284,"author_md5":285,"is_original":4,"collection":286,"summary_md5":287,"cover_url":288,"cover_url_1_1":289},470,"ff7ad5d0851d7c9b449f77614e1a1900","2023-04-23","cfab1ba8c67c7c838db98d666f02a132","#LLM","02489f90a678bce97b5afd07a4ef8ec7","article_res/cover/61de40f2a7f07fb960c35a4ca349a8c8.jpeg","article_res/cover/405a4b56362021d9b10a18979f215b8e.jpeg",{"id":291,"title_md5":292,"publish_date":293,"author_md5":294,"is_original":4,"collection":5,"summary_md5":295,"cover_url":296,"cover_url_1_1":297},557,"4faa16c9ff4d87d59c3378a2a3e3a6cb","2022-04-29","8b3607d0f4181a3cb6ffdccf7185f09b","c58f6a7c526c526ce0bfda938cdc8eb1","article_res/cover/7a9b1375ed9bb298154981bae42b794d.jpeg","article_res/cover/afa281dd52bc0454e6735daa8e6b0706.jpeg",{"id":299,"title_md5":300,"publish_date":301,"author_md5":302,"is_original":4,"collection":5,"summary_md5":303,"cover_url":304,"cover_url_1_1":305},603,"b942a93a4869123008cee47039c6642d","2022-03-14","311a46cfdaa3afda544e9285644f70d7","3358d0f4fa8d1ecd938346891e2d7b26","article_res/cover/fc952f7551bc0bca96e637c2f15557a4.jpeg","article_res/cover/8b46b12fe2f1e894215fe1677d50351b.jpeg",{"id":307,"title_md5":308,"publish_date":309,"author_md5":261,"is_original":4,"collection":310,"summary_md5":311,"cover_url":312,"cover_url_1_1":313},396,"b332f78452326c35e2646bb29e60ffdf","2023-10-11","#OpenAI #Object Detection #Microsoft","8fb401d69da5891d1ceaa72e30bfc058","article_res/cover/d379f570aaa88563834ed46fb86ab642.jpeg","article_res/cover/ff604b16b7422451a14069a166c480c4.jpeg",{"id":315,"title_md5":316,"publish_date":317,"author_md5":261,"is_original":23,"collection":65,"summary_md5":318,"cover_url":319,"cover_url_1_1":320},210,"9bc74099e618927d861c48f879a48322","2024-07-29","58563034117b55c61c58e6b57f0e59aa","article_res/cover/6dc01d42989b05ebdba02ada625635a6.jpeg","article_res/cover/6fe27209dc0a730b623323f93576f60e.jpeg",{"related":322,"small":357},[323,331,338,345,349],{"id":324,"publish_date":325,"is_original":23,"collection":326,"cover_url":327,"cover_url_1_1":328,"title":329,"summary":330,"author":28},379,"2023-11-14","#Meta #Object Detection","article_res/cover/f4107a5c33083d2f1b89d05b3cef877f.jpeg","article_res/cover/cda4f1eb5a1c8fee8d24a9adc7909f06.jpeg","Explore DINOv2: Meta's breakthrough self-supervised visual model","DINOv2: State-of-the-art computer vision models with self-supervised learning",{"id":332,"publish_date":333,"is_original":23,"collection":5,"cover_url":334,"cover_url_1_1":335,"title":336,"summary":337,"author":28},482,"2023-04-12","article_res/cover/3bc5b47ff3b06f43fe40a062c0fb8483.jpeg","article_res/cover/1add310252a4aab7eb875fe2ddd913f6.jpeg","【AIGC Learning】Use Google Colab to directly converse with ChatGPT - ChatGPT API Usage 1","GPT-4 will massively speed your ability to do these things, and with greater breadth and scope.  \n- Reid Hoffman",{"id":339,"publish_date":340,"is_original":4,"collection":5,"cover_url":341,"cover_url_1_1":342,"title":343,"summary":344,"author":28},334,"2024-01-27","article_res/cover/559efa3a43c7fa1bcea1482e79158733.jpeg","article_res/cover/34e9689d58317acfe4660373d1e41c13.jpeg","DePIN in \"CRYPTO THESES 2024\"","I don’t think there is a more important area of long-term development than DePIN.",{"id":299,"publish_date":301,"is_original":4,"collection":5,"cover_url":304,"cover_url_1_1":305,"title":346,"summary":347,"author":348},"How to evaluate investments - Future Value and Present Value","Remember that stock prices are not about the past. They are a prediction of future cash flows discounted back to the present.","Course notes",{"id":350,"publish_date":351,"is_original":23,"collection":352,"cover_url":353,"cover_url_1_1":354,"title":355,"summary":356,"author":28},90,"2024-12-25","#AI Agent #Microsoft #Anthropic #LLM #Langchain","article_res/cover/6e5c663691d0ab5cd19bc6dbed6bc0f1.jpeg","article_res/cover/271445d9bce34ec6eff52b1fc876576d.jpeg","Building AI Agents","The road to AGI",[358,364,370],{"title":10,"list":359},[360,361,362,363],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":365},[366,367,368,369],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},{"title":242,"list":371},[],[8,9,10],[8,12,13,14,9,10,15,16,17,18],["Reactive",245],1754646421358]