[{"data":1,"prerenderedAt":363},["ShallowReactive",2],{"$fgukOamtKU1RtUiMFsqdObttmqPPQz0uc7bl_gj_LyX0":3,"$fm0StoRIozgBh2Ncn6s35GYdfrb1CnKx4ICqs9CiER_Q":245,"article-76":362},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"popular":19,"latest":86,"banner":126,"list":151,"cache":244},[8,9,10],"Agent","OpenAI","LLM",[8,12,13,14,9,10,15,16,17,18],"Google","Nvidia","Claude","DeepSeek","OCR","Chat","Generator",[20,29,37,45,54,62,70,79],{"id":21,"publish_date":22,"is_original":23,"collection":5,"cover_url":24,"cover_url_1_1":25,"title":26,"summary":27,"author":28},411,"2023-09-10",1,"article_res/cover/451ef50c225a8dc61c4336506794d13b.jpeg","article_res/cover/3ba9dc7a72f87d40b20fc2d225289ee3.jpeg","Idealism","Reality is created by the mind, we can change our reality by changing our mind. - Plato","Renee's Entrepreneurial Journey",{"id":30,"publish_date":31,"is_original":23,"collection":32,"cover_url":33,"cover_url_1_1":34,"title":35,"summary":36,"author":28},108,"2024-12-07","#LLM #AGI #AI Agent","article_res/cover/0039044422e4ec9f61c18e8ee1693bb0.jpeg","article_res/cover/4220971b108a91d21407d87bb02fbaa6.jpeg","Freysa.ai: The World's First Adversarial AI Agent Game","说服 Freysa 把钱包里的钱都拿出来",{"id":38,"publish_date":39,"is_original":23,"collection":40,"cover_url":41,"cover_url_1_1":42,"title":43,"summary":44,"author":28},12,"2025-03-09","#Oxford #Reasoning #LLM #Tool Use","article_res/cover/d448e9b3617a0b5302e1bd10c438bca9.jpeg","article_res/cover/864a468f9cc4c9317efadb3811909888.jpeg","Agentic Reasoning Framework - Significantly enhance the reasoning ability of LLMs through the integration of external tools using agents","Agentic Reasoning: Reasoning LLMs with Tools for Deep Research",{"id":46,"publish_date":47,"is_original":4,"collection":48,"cover_url":49,"cover_url_1_1":50,"title":51,"summary":52,"author":53},480,"2023-04-14","#Stable Diffusion","article_res/cover/0bdbe7cb1de4a78e54536e5d9afa7ec9.jpeg","article_res/cover/b3d6ffec0608dcfaf18c5a69906d1490.jpeg","【AIGC Learning】Generate Prompts Using Word Graphs - Stable Diffusion Web UI Series 13","AI will become a powerful tool in education, transforming the way we learn and deliver instruction.  \n- Reid Hoffman","--",{"id":55,"publish_date":56,"is_original":4,"collection":57,"cover_url":58,"cover_url_1_1":59,"title":60,"summary":61,"author":28},413,"2023-09-08","#Neuroscience","article_res/cover/74f8302d78a23d9430f22171eae136b6.jpeg","article_res/cover/87ca08af81bb304746be5261160964c0.jpeg","Can machines be conscious?","Do we have an ethical obligation to not turn off conscious machines? Would turning them off be murder? No. I don't lose any sleep over unplugging a conscious machine.\n- Jeff Hawkins, \"A Thousand Brains\"",{"id":63,"publish_date":64,"is_original":23,"collection":65,"cover_url":66,"cover_url_1_1":67,"title":68,"summary":69,"author":28},178,"2024-09-09","#Entrepreneurship","article_res/cover/a7224f025b55d1820408085faef63079.jpeg","article_res/cover/11a9995b096cbf64465ef01b8673b154.jpeg","37signals company","This damn sense of relaxation",{"id":71,"publish_date":72,"is_original":4,"collection":73,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":78},460,"2023-05-12","#Google","article_res/cover/b970687b12faa52da976f91248c2aa7b.jpeg","article_res/cover/d1e71b52cfd2c63bc6e71f3e85ff135c.jpeg","Learn what BRC-20 and Ordinals are using Google Bard","Ordinals - a new protocol that allows users to store arbitrary data on the Bitcoin blockchain","Google Bard mainly writes",{"id":80,"publish_date":81,"is_original":23,"collection":5,"cover_url":82,"cover_url_1_1":83,"title":84,"summary":85,"author":28},309,"2024-03-26","article_res/cover/9877f95894ee88532d0e6012c23a2df3.jpeg","article_res/cover/20092164ddc109ce6ae56b1984246751.jpeg","Learning the Cancun Upgrade with lepton and perplexity","Building a quick conversation-based search demo with Lepton AI.",[87,95,103,111,119],{"id":88,"publish_date":89,"is_original":23,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":28},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[127,135,143],{"id":128,"publish_date":129,"is_original":23,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":28},300,"2024-04-16","#AI in Science #AGI","article_res/cover/6bf01e793e0f33e848572412eebdf9b0.jpeg","article_res/cover/91a5ee21dafecb914fabeb9430d46ec1.jpeg","Would Einstein lose his job - AI and Quantum Computing: A Glimpse into the Near Future","So Einstein's job is still safe.",{"id":136,"publish_date":137,"is_original":23,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":28},101,"2024-12-14","#Nvidia #AI 3D Generator","article_res/cover/693e07c85980c5c0c8fde3f037733f23.jpeg","article_res/cover/9ea8edff2d5d303ff3fffff3f6f9c3d9.jpeg","NVIDIA's open-source 3D project LLaMA-Mesh","LLaMA-Mesh: Unifying 3D Mesh Generation with Language Models",{"id":144,"publish_date":145,"is_original":23,"collection":146,"cover_url":147,"cover_url_1_1":148,"title":149,"summary":150,"author":28},131,"2024-11-10","#OpenAI","article_res/cover/87f8ed353ce39f31960e7cdfaf075a35.jpeg","article_res/cover/f597a63935f5cd32e484b4aadd6019e8.jpeg","ChatGPT has launched the Search function","Get fast, timely answers with links to relevant web sources.",{"big":152,"small":214},[153,181],{"title":154,"list":155},"AGENT",[156,157,165,173],{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":158,"publish_date":159,"is_original":23,"collection":160,"cover_url":161,"cover_url_1_1":162,"title":163,"summary":164,"author":28},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"id":174,"publish_date":175,"is_original":23,"collection":176,"cover_url":177,"cover_url_1_1":178,"title":179,"summary":180,"author":28},616,"2025-03-29","#MAS #AI Agent #AI Coder #MetaGPT #MGX","article_res/cover/9dcd702ad2035902e5e77967c34a1f1e.jpeg","article_res/cover/0a97fc4a922753c8f46ff38792020df8.jpeg","MGX - An automated website-building platform composed of multiple AI Agents","Your 24/7 AI Team | Dream, Chat, Create.",{"title":182,"list":183},"OPENAI",[184,191,199,206],{"id":185,"publish_date":167,"is_original":23,"collection":186,"cover_url":187,"cover_url_1_1":188,"title":189,"summary":190,"author":28},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":192,"publish_date":193,"is_original":4,"collection":194,"cover_url":195,"cover_url_1_1":196,"title":197,"summary":198,"author":28},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  \n - OpenAI",{"id":200,"publish_date":201,"is_original":4,"collection":146,"cover_url":202,"cover_url_1_1":203,"title":204,"summary":205,"author":28},417,"2023-08-24","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"id":207,"publish_date":208,"is_original":4,"collection":209,"cover_url":210,"cover_url_1_1":211,"title":212,"summary":213,"author":28},407,"2023-09-22","#OpenAI #AI Image Generator","article_res/cover/c59005e903d35cfc32346e2756e2728a.jpeg","article_res/cover/ba011d265e6d84b5c8cb6fd6b757b6cc.jpeg","Dall-E 3","DALL·E 3 understands significantly more nuance and detail, allowing you to easily translate your ideas into images.",[215,221,241],{"title":10,"list":216},[217,218,219,220],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":223},"GOOGLE",[224,225,226,234],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},615,"2025-03-30","#AI Researcher #AI Science #HKU #Google #AI Agent","article_res/cover/21fadf906067714bb0db31ae13a77c15.jpeg","article_res/cover/2697999a72bd26b22e85f0e92936d3ed.jpeg","AI-Researcher: LLM-driven全自动 scientific research assistant","AI-Researcher: Fully-Automated Scientific Discovery with LLM Agents  \nOpen-Sourced Alternative to Google AI Co-Scientist",{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},463,"2023-05-09","article_res/cover/89800f207723acdb55fc53bf999ebdc9.jpeg","article_res/cover/5764f369b4accd8f83e94aa4c077a175.jpeg","The Smallville sandbox world - A town with 25 virtual residents","Believable proxies of human behavior can empower interactive apps: Immersive environment, Rehearsal space, Prototyping tool",{"title":242,"list":243},"NVIDIA",[],true,{"code":4,"msg":5,"data":246},{"id":247,"publish_date":248,"is_original":23,"collection":249,"articles_id":250,"cover_url":251,"cover_url_1_1":252,"title":253,"summary":254,"author":28,"content":255,"popular":256,"list":310,"category":360,"tag":361},76,"2025-01-07","#AI Video Generation #Google","ybGl8DFjHKjw4PUro9_m3w","article_res/cover/3613b008d99e99c267bce18c2c7f4003.jpeg","article_res/cover/055bce99535eb3d30085f963064e930a.jpeg","Vision Transformer (ViT)","An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>the difference. Today, I learned that the latest Sora-like video generation technology mainly adopts Vision Transformer. I don't really understand it, and I might be explaining it incorrectly; it's mainly for my own learning purposes.\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100008915\" data-ratio=\"0.33796296296296297\" data-s=\"300,640\" data-type=\"webp\" data-w=\"1080\" style=\"\" src=\"./assets/17423771980830.01775580687765821.jpeg\">\u003C/p>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Vision Transformer, Overview of ViT\u003C/strong>\u003C/span>\u003C/h2>\u003Cp style='margin-bottom: 0px;cursor: pointer;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>It is a model for image classification that processes image patches using an architecture similar to Transformers. ViT was first successfully applied to large-scale image recognition tasks in the paper \"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale\" published by Alexey Dosovitskiy et al. in 2020, showing excellent performance and promoting the development of visual representation learning and modern computer vision.\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Core Concept\u003C/strong>\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;cursor: pointer;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Divide the image into non-overlapping patches of fixed size (e.g., 16x16 pixels), and perform linear embedding after flattening each patch.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Add positional encoding to preserve spatial information, as the Transformer itself is not sensitive to the order of arrangement.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">The sequence of embedded image patches is input into a standard Transformer encoder for processing.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">A learnable [CLS] token is added to aggregate the full-image information for classification tasks.\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Research Contributions\u003C/strong>\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;cursor: pointer;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">It has been proven that without relying on Convolutional Neural Networks (CNNs), a pure Transformer architecture can also achieve excellent performance in image classification tasks.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">After pre-training on large-scale datasets (such as ImageNet-21k), Vision Transformers (ViT) perform well when transferred to medium and small-scale image recognition benchmarks (such as ImageNet, CIFAR-100, VTAB), while requiring significantly fewer computational resources for training.\u003C/section>\u003C/li>\u003C/ul>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Detailed architecture of ViT\u003C/strong>\u003C/span>\u003C/h2>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100008916\" data-ratio=\"0.7601246105919003\" data-s=\"300,640\" data-type=\"png\" data-w=\"642\" style=\"\" src=\"./assets/17423771981600.187318650988763.png\">\u003C/p>\u003Cspan style=\"cursor: pointer;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">\u003C/strong>\u003C/span>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">1. Image processing flow\u003C/strong>\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;cursor: pointer;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Divide the input image into fixed-size, non-overlapping blocks (e.g., 16x16 pixels).\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Each block is flattened and embedded as a vector through a linear layer.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Add absolute position encoding to each block embedding to retain spatial information.\u003C/section>\u003C/li>\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">Input the sequence of embeddings of all blocks into a standard Transformer encoder.\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">2. Classification mechanism\u003C/strong>\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;cursor: pointer;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\"cursor: pointer;\">\u003Csection style=\"cursor: pointer;margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">A special [CLS] token is added to the input sequence, and after processing by the Transformer encoder, the output vector of this token is used for classification tasks.\u003C/section>\u003C/li>\u003C/ul>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Comparative analysis\u003C/strong>\u003C/span>\u003C/h2>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">a. Architecture and design\u003C/strong>\u003C/span>\u003C/h3>\u003Csection style='cursor: pointer;margin-bottom: 0px;overflow-x: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Ctable>\u003Cthead style=\"cursor: pointer;\">\u003Ctr style=\"cursor: pointer;\">\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Characteristics\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Vision Transformer (ViT)\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Autoregressive Transformer (AR)\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Diffusion Transformer (DiT)\u003C/th>\u003C/tr>\u003C/thead>\u003Ctbody style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;border-width: 0px;border-style: initial;border-color: initial;\">\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Data processing\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Treating images as a sequence of patches\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Processing sequence data (text, images)\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Modeling data through noise perturbation and denoising\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Positional encoding\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Crucial for spatial information\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Critical for maintaining sequence order\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Used to maintain structure during the diffusion process\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Model components\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Block embedding, Transformer encoder\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Masked self-attention, Transformer decoder\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Transformer layers in diffusion steps\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Generation capability\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Limited (mainly used for discriminative tasks)\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Strong generation capability\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Strong generation capability, with high fidelity\u003C/td>\u003C/tr>\u003C/tbody>\u003C/table>\u003C/section>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">b. Application fields\u003C/strong>\u003C/span>\u003C/h3>\u003Csection style='cursor: pointer;margin-bottom: 0px;overflow-x: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Ctable>\u003Cthead style=\"cursor: pointer;\">\u003Ctr style=\"cursor: pointer;\">\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Application Field\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">ViT\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Autoregressive Transformer (AR)\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Diffusion Transformer (DiT)\u003C/th>\u003C/tr>\u003C/thead>\u003Ctbody style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;border-width: 0px;border-style: initial;border-color: initial;\">\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Image Classification\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Main Purpose\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Less common, may be achievable on image sequences\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Typically not used for classification tasks\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Image generation\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Limited, requires modification\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Effective when images are treated as sequences\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Highly efficient and state-of-the-art quality\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Natural Language Processing\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Not directly applicable\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Core applications (such as GPT models)\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">With more limitations, unless integrated into multimodal models\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Other fields\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Object detection, segmentation\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Music generation, code generation, etc.\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Audio synthesis, video generation, etc.\u003C/td>\u003C/tr>\u003C/tbody>\u003C/table>\u003C/section>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">c. Advantages and merits\u003C/strong>\u003C/span>\u003C/h3>\u003Csection style='cursor: pointer;margin-bottom: 0px;overflow-x: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Ctable>\u003Cthead style=\"cursor: pointer;\">\u003Ctr style=\"cursor: pointer;\">\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Aspect\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">ViT\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Autoregressive Transformer (AR)\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Diffusion Transformer (DiT)\u003C/th>\u003C/tr>\u003C/thead>\u003Ctbody style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;border-width: 0px;border-style: initial;border-color: initial;\">\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Performance\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Compete with CNNs in visual tasks\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Superior performance in generation tasks\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Leading in high-fidelity generation\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Scalability\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Scales well with increasing data and model size\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Highly scalable, benefiting from large-scale datasets\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Extensible, but computationally intensive due to the multi-step diffusion process\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Flexibility\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Mainly used for visual tasks, adaptable to some tasks\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Multifunctionality across multiple domains\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Mainly used for generative tasks, can be adapted through conditioning\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Interpretability\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">The block-based method provides a certain level of interpretability\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">The sequential nature helps in understanding the generation process\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Due to the complexity of the diffusion process, it is more difficult to explain\u003C/td>\u003C/tr>\u003C/tbody>\u003C/table>\u003C/section>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);cursor: pointer;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\"cursor: pointer;font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">d. Limitations and Challenges\u003C/strong>\u003C/span>\u003C/h3>\u003Csection style='cursor: pointer;margin-bottom: 0px;overflow-x: auto;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Ctable>\u003Cthead style=\"cursor: pointer;\">\u003Ctr style=\"cursor: pointer;\">\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Aspect\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">ViT\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Autoregressive Transformer (AR)\u003C/th>\u003Cth style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;text-align: left;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(240, 240, 240);width: auto;height: auto;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;min-width: 85px;\">Diffusion Transformer (DiT)\u003C/th>\u003C/tr>\u003C/thead>\u003Ctbody style=\"cursor: pointer;line-height: 1.5em;letter-spacing: 0em;border-width: 0px;border-style: initial;border-color: initial;\">\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Data efficiency\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Requires a large amount of data to perform well\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">May require a large amount of data, especially for long sequences\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Extremely requires data and computational resources\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Computational cost\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Due to the Transformer layers, especially for high-resolution images, the computational cost is high\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Due to self-attention, the computational cost is high for long sequences\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">The computational cost is very high due to the iterative denoising steps.\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background-image: none;background-position: 0% 0%;background-size: auto;background-repeat: no-repeat;background-attachment: scroll;background-origin: padding-box;background-clip: border-box;width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Training complexity\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Training from scratch without pretraining can be challenging.\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Requires careful handling of sequence length and masking.\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Training is complex due to the dual process (diffusion and Transformer).\u003C/td>\u003C/tr>\u003Ctr style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgb(248, 248, 248);width: auto;height: auto;\">\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">\u003Cstrong style=\"cursor: pointer;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Generation quality\u003C/strong>\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Limited compared to specialized generative models\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">Without sufficient training, it may be difficult to achieve high-fidelity generation\u003C/td>\u003Ctd style=\"cursor: pointer;min-width: 85px;border-color: rgba(204, 204, 204, 0.4);border-radius: 0px;\">May produce artifacts if improperly trained, but generally the quality is high\u003C/td>\u003C/tr>\u003C/tbody>\u003C/table>\u003C/section>\u003Cp>\u003Cbr>\u003C/p>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",[257,266,269,277,280,287,294,302],{"id":258,"title_md5":259,"publish_date":260,"author_md5":261,"is_original":23,"collection":262,"summary_md5":263,"cover_url":264,"cover_url_1_1":265},617,"b504ee20a2da945d6448354924d11bb3","2025-03-28","bc27fa490c4d0d525bac812fc0793534","#Andrej Karpathy #Vibe Coding #AI Coding #LLM","7b72b4ce13d75569ff29c7503bd24fea","article_res/cover/a1ac79ba0f9750822ccf8bfe15607b9f.jpeg","article_res/cover/9caa1d90335c0ffa5f723922fc8a2121.jpeg",{"id":136,"title_md5":267,"publish_date":137,"author_md5":261,"is_original":23,"collection":138,"summary_md5":268,"cover_url":139,"cover_url_1_1":140},"91e5fe35f658bcf1607069e555cb5bed","e6f229e9aab5308ce7871e771838a307",{"id":270,"title_md5":271,"publish_date":272,"author_md5":273,"is_original":4,"collection":5,"summary_md5":274,"cover_url":275,"cover_url_1_1":276},577,"e049d7a99f585f82de08c36d769611a0","2022-04-09","46d4befbaca33274c83a26b5fc7c9d12","e0a47cf523942d77b05e0b98f3b5a980","article_res/cover/4482e63f9a8a649b2f272f91f453b607.jpeg","article_res/cover/764f78c1bb5669685f803ef084ad0725.jpeg",{"id":192,"title_md5":278,"publish_date":193,"author_md5":261,"is_original":4,"collection":194,"summary_md5":279,"cover_url":195,"cover_url_1_1":196},"a39f66d5a2901fac19dfcbdc8a1c163c","3383b510f8d3e2c32419df8cd3b321aa",{"id":281,"title_md5":282,"publish_date":283,"author_md5":261,"is_original":23,"collection":65,"summary_md5":284,"cover_url":285,"cover_url_1_1":286},442,"a0583e1b06f6df1c9341bc5d689426c4","2023-06-22","7fe2002395be0cb91f1f3a4017cbea4c","article_res/cover/2dd0c4e3d8f15e1059b9d9d6db552590.jpeg","article_res/cover/495132f609e2d722e8fb11ba67b7d945.jpeg",{"id":288,"title_md5":289,"publish_date":290,"author_md5":261,"is_original":23,"collection":291,"summary_md5":5,"cover_url":292,"cover_url_1_1":293},125,"5396b2bb9a0d6f91daee4940304fc984","2024-11-18","#Elon Musk","article_res/cover/cc525b90d4da39ba5439ef274393384b.jpeg","article_res/cover/36e89148a2a07d60e271128800a9ac08.jpeg",{"id":295,"title_md5":296,"publish_date":297,"author_md5":261,"is_original":23,"collection":298,"summary_md5":299,"cover_url":300,"cover_url_1_1":301},230,"2700effd8148411db487e01e22500d9c","2024-07-03","#AI Video Generator","7b90e56d79cc9896fe5cc115bf76ecf8","article_res/cover/cb99534c58c625dca7891a6293186620.jpeg","article_res/cover/c0329f1fab81e9034fa89c5cb51ff145.jpeg",{"id":303,"title_md5":304,"publish_date":305,"author_md5":261,"is_original":23,"collection":306,"summary_md5":307,"cover_url":308,"cover_url_1_1":309},224,"6750b90e233de5af6a744a3ec322b2c9","2024-07-09","#AI Avatar","aa36a9e66a4e0603e8124b0c52a669cf","article_res/cover/670fbb56c51f582e4b359a0b5c9eefd0.jpeg","article_res/cover/c9a84424a4c803f0e2565e847e9f36ac.jpeg",{"related":311,"small":345},[312,320,328,336,344],{"id":313,"publish_date":314,"is_original":4,"collection":5,"cover_url":315,"cover_url_1_1":316,"title":317,"summary":318,"author":319},529,"2022-05-27","article_res/cover/f8d7e7f132aa8a56092462fc630f5ba5.jpeg","article_res/cover/5d0b2e9be6bb118dea508cdc8a65eadd.jpeg","Decentralized Society: Finding Web3’s Soul / Decentralized Society: Seeking the Soul of Web3 -9","Help communities composed of imperfectly cooperative but socially connected individuals overcome their social differences","Translation",{"id":321,"publish_date":322,"is_original":23,"collection":323,"cover_url":324,"cover_url_1_1":325,"title":326,"summary":327,"author":28},37,"2025-02-16","#Deep Dive into LLMs #Andrej Karpathy #LLM #Self","article_res/cover/9af22e1c164374b6cda23c7c0fc490e0.jpeg","article_res/cover/004d36f7eed707f381f30466314ab99d.jpeg","\"Self-awareness\" of LLM - Andrej Karpathy's in-depth explanation of LLM (Part 5)","Knowledge of self",{"id":329,"publish_date":330,"is_original":4,"collection":331,"cover_url":332,"cover_url_1_1":333,"title":334,"summary":335,"author":28},389,"2023-10-23","#Psychology #LLM","article_res/cover/6e93f4660361e315031231d4495b6909.jpeg","article_res/cover/f0137b4d702612e32422491076d40496.jpeg","Comparison of LLMs with Children's Defense Mechanisms","Defense, in general, is a psychological operation that removes some component(s) of unpleasant emotions—thoughts, feelings, or both—beyond conscious awareness.",{"id":337,"publish_date":338,"is_original":23,"collection":339,"cover_url":340,"cover_url_1_1":341,"title":342,"summary":343,"author":28},212,"2024-07-23","#AI Virtual Try-On #Tencent","article_res/cover/4b843969ae2bdf19e55f75c824be7d4f.jpeg","article_res/cover/762ea863dd1d1fa32990288b476313c5.jpeg","IMAGDressing","IMAGDressing: Interactive Modular Apparel Generation for Virtual Dressing",{"id":207,"publish_date":208,"is_original":4,"collection":209,"cover_url":210,"cover_url_1_1":211,"title":212,"summary":213,"author":28},[346,352,358],{"title":10,"list":347},[348,349,350,351],{"id":96,"publish_date":97,"is_original":23,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":28},{"id":104,"publish_date":105,"is_original":23,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":28},{"id":112,"publish_date":113,"is_original":23,"collection":114,"cover_url":115,"cover_url_1_1":116,"title":117,"summary":118,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"title":222,"list":353},[354,355,356,357],{"id":120,"publish_date":113,"is_original":23,"collection":121,"cover_url":122,"cover_url_1_1":123,"title":124,"summary":125,"author":28},{"id":166,"publish_date":167,"is_original":23,"collection":168,"cover_url":169,"cover_url_1_1":170,"title":171,"summary":172,"author":28},{"id":227,"publish_date":228,"is_original":23,"collection":229,"cover_url":230,"cover_url_1_1":231,"title":232,"summary":233,"author":28},{"id":235,"publish_date":236,"is_original":23,"collection":73,"cover_url":237,"cover_url_1_1":238,"title":239,"summary":240,"author":28},{"title":242,"list":359},[],[8,9,10],[8,12,13,14,9,10,15,16,17,18],["Reactive",245],1754646409443]