diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..c626b00 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,13 @@ +# These are supported funding model platforms + +github: [kyegomez] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry +custom: #Nothing diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..d43f812 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,27 @@ +--- +name: Bug report +about: Create a detailed report on the bug and its root cause. Conduct root cause error analysis +title: "[BUG] " +labels: bug +assignees: kyegomez + +--- + +**Describe the bug** +A clear and concise description of what the bug is and what the main root cause error is. Test very thoroughly before submitting. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..806abd7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: 'kyegomez' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.yml b/.github/PULL_REQUEST_TEMPLATE.yml new file mode 100644 index 0000000..8e03012 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.yml @@ -0,0 +1,22 @@ + + +Zeta provides you with reliable, high performance, and fast modular building blocks for building zeta scale neural nets at lightspeed with minimal code and a pythonic API. + +[Click here for Zeta Documentation →](zeta/) + + +## Examples + +Check out Zeta examples for building agents, data retrieval, and more. + +[Checkout Zeta examples →](examples/) diff --git a/docs/metric.md b/docs/metric.md new file mode 100644 index 0000000..a223edc --- /dev/null +++ b/docs/metric.md @@ -0,0 +1,4 @@ +# The Golden Metric: + +* We need to figure out a single metric that determines if we're accomplishing our goal with zeta which is to build zetascale superintelligent AI models as fast as possible with minimal code. + diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 0000000..8dbe669 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,9 @@ +{% extends "base.html" %} + + + +{% block announce %} +
+ Star and contribute to Zeta on GitHub! +
+{% endblock %} \ No newline at end of file diff --git a/docs/purpose.md b/docs/purpose.md new file mode 100644 index 0000000..d0ed588 --- /dev/null +++ b/docs/purpose.md @@ -0,0 +1,47 @@ +# Zeta's Purpose + + +Every once in a while, a revolutionary project comes along that changes everything. + +A landscape cluttered by rigid frameworks, plagued by inefficiencies, and where developers - our brightest minds - are bogged down by limitations. + +Now, imagine a world where harnessing the power of state-of-the-art models isn't just possible... it's simple. A world where efficiency doesn’t sacrifice safety, and where your ideas are bounded only by your imagination. We should be living in this world. But we aren't. + + +Because Zeta is what's missing. + + +The challenge? Creating a framework that's not just another tool, but a revolution. + +To bridge this gap, one would need to optimize at the foundational level, prioritize user experience, and introduce a design philosophy that future-proofs. It's colossal. And until now, no one's even come close. + + +But there’s an enormous opportunity here. An opportunity that promises not just recognition but the power to redefine an industry. And, the key to unlocking this future? It's been with us all along. + + +Insight. + + +Introducing... Zeta. + + +Our secret? Fluidity. + +It’s a philosophy that values modularity, reliability, usability, and unmatched speed. + +But more than that, it's a commitment to evolution, to pushing boundaries, to never settling. + + +Why are we the best to execute this vision? + +Because we've been there from the start. + +We've seen the challenges, felt the frustrations, and now, we're poised to lead the revolution. + +We’ve done it before, and with Zeta, we’re doing it again. + + +Zeta isn’t just the next step. It's a leap into the future. + +Zeta is the future of AI. 
+ diff --git a/docs/research.md b/docs/research.md new file mode 100644 index 0000000..83fd262 --- /dev/null +++ b/docs/research.md @@ -0,0 +1,1103 @@ +# Awesome Multimodal Machine Learning + +By [Paul Liang](http://www.cs.cmu.edu/~pliang/) (pliang@cs.cmu.edu), [Machine Learning Department](http://www.ml.cmu.edu/) and [Language Technologies Institute](https://www.lti.cs.cmu.edu/), [CMU](https://www.cmu.edu/), with help from members of the [MultiComp Lab](http://multicomp.cs.cmu.edu/) at LTI, CMU. If there are any areas, papers, and datasets I missed, please let me know! + +## Course content + workshops + +Check out our comprehensive tutorial paper [Foundations and Recent Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions](https://arxiv.org/abs/2209.03430). + +[Tutorials on Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-tutorial/cvpr2022/) at CVPR 2022 and NAACL 2022, slides and videos [here](https://cmu-multicomp-lab.github.io/mmml-tutorial/schedule/). + +New course [11-877 Advanced Topics in Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/adv-mmml-course/spring2022/) Spring 2022 @ CMU. It will primarily be reading and discussion-based. We plan to post discussion probes, relevant papers, and summarized discussion highlights every week on the website. + +Public course content and lecture videos from [11-777 Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-course/fall2020/), Fall 2020 @ CMU. 
+ +## Table of Contents + +* [Survey Papers](#survey-papers) +* [Core Areas](#core-areas) + * [Multimodal Representations](#multimodal-representations) + * [Multimodal Fusion](#multimodal-fusion) + * [Multimodal Alignment](#multimodal-alignment) + * [Multimodal Pretraining](#multimodal-pretraining) + * [Multimodal Translation](#multimodal-translation) + * [Crossmodal Retrieval](#crossmodal-retrieval) + * [Multimodal Co-learning](#multimodal-co-learning) + * [Missing or Imperfect Modalities](#missing-or-imperfect-modalities) + * [Analysis of Multimodal Models](#analysis-of-multimodal-models) + * [Knowledge Graphs and Knowledge Bases](#knowledge-graphs-and-knowledge-bases) + * [Intepretable Learning](#intepretable-learning) + * [Generative Learning](#generative-learning) + * [Semi-supervised Learning](#semi-supervised-learning) + * [Self-supervised Learning](#self-supervised-learning) + * [Language Models](#language-models) + * [Adversarial Attacks](#adversarial-attacks) + * [Few-Shot Learning](#few-shot-learning) + * [Bias and Fairness](#bias-and-fairness) + * [Human in the Loop Learning](#human-in-the-loop-learning) +* [Architectures](#architectures) + * [Multimodal Transformers](#multimodal-transformers) + * [Multimodal Memory](#multimodal-memory) +* [Applications and Datasets](#applications-and-datasets) + * [Language and Visual QA](#language-and-visual-qa) + * [Language Grounding in Vision](#language-grounding-in-vision) + * [Language Grouding in Navigation](#language-grouding-in-navigation) + * [Multimodal Machine Translation](#multimodal-machine-translation) + * [Multi-agent Communication](#multi-agent-communication) + * [Commonsense Reasoning](#commonsense-reasoning) + * [Multimodal Reinforcement Learning](#multimodal-reinforcement-learning) + * [Multimodal Dialog](#multimodal-dialog) + * [Language and Audio](#language-and-audio) + * [Audio and Visual](#audio-and-visual) + * [Visual, IMU and Wireless](#visual-imu-and-wireless) + * [Media 
Description](#media-description) + * [Video Generation from Text](#video-generation-from-text) + * [Affect Recognition and Multimodal Language](#affect-recognition-and-multimodal-language) + * [Healthcare](#healthcare) + * [Robotics](#robotics) + * [Autonomous Driving](#Autonomous-Driving) + * [Finance](#Finance) + * [Human AI Interaction](#Human-AI-Interaction) +* [Workshops](#workshops) +* [Tutorials](#tutorials) +* [Courses](#courses) + + +# Research Papers + +## Survey Papers + +[Foundations and Trends in Multimodal Machine Learning: Principles, Challenges, and Open Questions](https://arxiv.org/abs/2209.03430), arxiv 2023 + +[Multimodal Learning with Transformers: A Survey](https://arxiv.org/abs/2206.06488), TPAMI 2023 + +[Trends in Integration of Vision and Language Research: A Survey of Tasks, Datasets, and Methods](https://doi.org/10.1613/jair.1.11688), JAIR 2021 + +[Experience Grounds Language](https://arxiv.org/abs/2004.10151), EMNLP 2020 + +[A Survey of Reinforcement Learning Informed by Natural Language](https://arxiv.org/abs/1906.03926), IJCAI 2019 + +[Multimodal Machine Learning: A Survey and Taxonomy](https://arxiv.org/abs/1705.09406), TPAMI 2019 + +[Multimodal Intelligence: Representation Learning, Information Fusion, and Applications](https://arxiv.org/abs/1911.03977), arXiv 2019 + +[Deep Multimodal Representation Learning: A Survey](https://ieeexplore.ieee.org/abstract/document/8715409), arXiv 2019 + +[Guest Editorial: Image and Language Understanding](https://link.springer.com/article/10.1007/s11263-017-0993-y), IJCV 2017 + +[Representation Learning: A Review and New Perspectives](https://arxiv.org/abs/1206.5538), TPAMI 2013 + +[A Survey of Socially Interactive Robots](https://www.cs.cmu.edu/~illah/PAPERS/socialroboticssurvey.pdf), 2003 + +## Core Areas + +### Multimodal Representations + +[Identifiability Results for Multimodal Contrastive Learning](https://arxiv.org/abs/2303.09166), ICLR 2023 
[[code]](https://github.com/imantdaunhawer/multimodal-contrastive-learning) + +[Unpaired Vision-Language Pre-training via Cross-Modal CutMix](https://arxiv.org/abs/2206.08919), ICML 2022. + +[Balanced Multimodal Learning via On-the-fly Gradient Modulation](https://arxiv.org/abs/2203.15332), CVPR 2022 + +[Unsupervised Voice-Face Representation Learning by Cross-Modal Prototype Contrast](https://arxiv.org/abs/2204.14057), IJCAI 2021 [[code]](https://github.com/Cocoxili/CMPC) + +[Towards a Unified Foundation Model: Jointly Pre-Training Transformers on Unpaired Images and Text](https://arxiv.org/abs/2112.07074), arXiv 2021 + +[FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482), arXiv 2021 + +[Transformer is All You Need: Multimodal Multitask Learning with a Unified Transformer](https://arxiv.org/abs/2102.10772), arXiv 2021 + +[MultiBench: Multiscale Benchmarks for Multimodal Representation Learning](https://arxiv.org/abs/2107.07502), NeurIPS 2021 [[code]](https://github.com/pliang279/MultiBench) + +[Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206), ICML 2021 [[code]](https://github.com/deepmind/deepmind-research/tree/master/perceiver) + +[Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020), arXiv 2021 [[blog]](https://openai.com/blog/clip/) [[code]](https://github.com/OpenAI/CLIP) + +[VinVL: Revisiting Visual Representations in Vision-Language Models](https://arxiv.org/abs/2101.00529), arXiv 2021 [[blog]](https://www.microsoft.com/en-us/research/blog/vinvl-advancing-the-state-of-the-art-for-vision-language-models/?OCID=msr_blog_VinVL_fb) [[code]](https://github.com/pzzhang/VinVL) + +[Learning Transferable Visual Models From Natural Language Supervision](https://cdn.openai.com/papers/Learning_Transferable_Visual_Models_From_Natural_Language.pdf), arXiv 2020 [[blog]](https://openai.com/blog/clip/) 
[[code]](https://github.com/openai/CLIP) + +[12-in-1: Multi-Task Vision and Language Representation Learning](https://arxiv.org/abs/1912.02315), CVPR 2020 [[code]](https://github.com/facebookresearch/vilbert-multi-task) + +[Watching the World Go By: Representation Learning from Unlabeled Videos](https://arxiv.org/abs/2003.07990), arXiv 2020 + +[Learning Video Representations using Contrastive Bidirectional Transformer](https://arxiv.org/abs/1906.05743), arXiv 2019 + +[Visual Concept-Metaconcept Learning](https://papers.nips.cc/paper/8745-visual-concept-metaconcept-learning.pdf), NeurIPS 2019 [[code]](http://vcml.csail.mit.edu/) + +[OmniNet: A Unified Architecture for Multi-modal Multi-task Learning](https://arxiv.org/abs/1907.07804), arXiv 2019 [[code]](https://github.com/subho406/OmniNet) + +[Learning Representations by Maximizing Mutual Information Across Views](https://arxiv.org/abs/1906.00910), arXiv 2019 [[code]](https://github.com/Philip-Bachman/amdim-public) + +[ViCo: Word Embeddings from Visual Co-occurrences](https://arxiv.org/abs/1908.08527), ICCV 2019 [[code]](https://github.com/BigRedT/vico) + +[Unified Visual-Semantic Embeddings: Bridging Vision and Language With Structured Meaning Representations](http://openaccess.thecvf.com/content_CVPR_2019/papers/Wu_Unified_Visual-Semantic_Embeddings_Bridging_Vision_and_Language_With_Structured_Meaning_CVPR_2019_paper.pdf), CVPR 2019 + +[Multi-Task Learning of Hierarchical Vision-Language Representation](https://arxiv.org/abs/1812.00500), CVPR 2019 + +[Learning Factorized Multimodal Representations](https://arxiv.org/abs/1806.06176), ICLR 2019 [[code]](https://github.com/pliang279/factorized/) + +[A Probabilistic Framework for Multi-view Feature Learning with Many-to-many Associations via Neural Networks](https://arxiv.org/abs/1802.04630), ICML 2018 + +[Do Neural Network Cross-Modal Mappings Really Bridge Modalities?](https://aclweb.org/anthology/P18-2074), ACL 2018 + +[Learning Robust Visual-Semantic 
Embeddings](https://arxiv.org/abs/1703.05908), ICCV 2017 + +[Deep Multimodal Representation Learning from Temporal Data](https://arxiv.org/abs/1704.03152), CVPR 2017 + +[Is an Image Worth More than a Thousand Words? On the Fine-Grain Semantic Differences between Visual and Linguistic Representations](https://www.aclweb.org/anthology/C16-1264), COLING 2016 + +[Combining Language and Vision with a Multimodal Skip-gram Model](https://www.aclweb.org/anthology/N15-1016), NAACL 2015 + +[Deep Fragment Embeddings for Bidirectional Image Sentence Mapping](https://arxiv.org/abs/1406.5679), NIPS 2014 + +[Multimodal Learning with Deep Boltzmann Machines](https://dl.acm.org/citation.cfm?id=2697059), JMLR 2014 + +[Learning Grounded Meaning Representations with Autoencoders](https://www.aclweb.org/anthology/P14-1068), ACL 2014 + +[DeViSE: A Deep Visual-Semantic Embedding Model](https://papers.nips.cc/paper/5204-devise-a-deep-visual-semantic-embedding-model), NeurIPS 2013 + +[Multimodal Deep Learning](https://dl.acm.org/citation.cfm?id=3104569), ICML 2011 + +### Multimodal Fusion + +[Robust Contrastive Learning against Noisy Views](https://arxiv.org/abs/2201.04309), arXiv 2022 + +[Cooperative Learning for Multi-view Analysis](https://arxiv.org/abs/2112.12337), arXiv 2022 + +[What Makes Multi-modal Learning Better than Single (Provably)](https://arxiv.org/abs/2106.04538), NeurIPS 2021 + +[Efficient Multi-Modal Fusion with Diversity Analysis](https://dl.acm.org/doi/abs/10.1145/3474085.3475188), ACMMM 2021 + +[Attention Bottlenecks for Multimodal Fusion](https://arxiv.org/abs/2107.00135), NeurIPS 2021 + +[VMLoc: Variational Fusion For Learning-Based Multimodal Camera Localization](https://arxiv.org/abs/2003.07289), AAAI 2021 + +[Trusted Multi-View Classification](https://openreview.net/forum?id=OOsR8BzCnl5), ICLR 2021 [[code]](https://github.com/hanmenghan/TMC) + +[Deep-HOSeq: Deep Higher-Order Sequence Fusion for Multimodal Sentiment Analysis](https://arxiv.org/pdf/2010.08218.pdf), 
ICDM 2020 + +[Removing Bias in Multi-modal Classifiers: Regularization by Maximizing Functional Entropies](https://arxiv.org/abs/2010.10802), NeurIPS 2020 [[code]](https://github.com/itaigat/removing-bias-in-multi-modal-classifiers) + +[Deep Multimodal Fusion by Channel Exchanging](https://arxiv.org/abs/2011.05005?context=cs.LG), NeurIPS 2020 [[code]](https://github.com/yikaiw/CEN) + +[What Makes Training Multi-Modal Classification Networks Hard?](https://arxiv.org/abs/1905.12681), CVPR 2020 + +[Dynamic Fusion for Multimodal Data](https://arxiv.org/abs/1911.03821), arXiv 2019 + +[DeepCU: Integrating Both Common and Unique Latent Information for Multimodal Sentiment Analysis](https://www.ijcai.org/proceedings/2019/503), IJCAI 2019 [[code]](https://github.com/sverma88/DeepCU-IJCAI19) + +[Deep Multimodal Multilinear Fusion with High-order Polynomial Pooling](https://papers.nips.cc/paper/9381-deep-multimodal-multilinear-fusion-with-high-order-polynomial-pooling), NeurIPS 2019 + +[XFlow: Cross-modal Deep Neural Networks for Audiovisual Classification](https://ieeexplore.ieee.org/abstract/document/8894404), IEEE TNNLS 2019 [[code]](https://github.com/catalina17/XFlow) + +[MFAS: Multimodal Fusion Architecture Search](https://arxiv.org/abs/1903.06496), CVPR 2019 + +[The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision](https://arxiv.org/abs/1904.12584), ICLR 2019 [[code]](http://nscl.csail.mit.edu/) + +[Unifying and merging well-trained deep neural networks for inference stage](https://www.ijcai.org/Proceedings/2018/0283.pdf), IJCAI 2018 [[code]](https://github.com/ivclab/NeuralMerger) + +[Efficient Low-rank Multimodal Fusion with Modality-Specific Factors](https://arxiv.org/abs/1806.00064), ACL 2018 [[code]](https://github.com/Justin1904/Low-rank-Multimodal-Fusion) + +[Memory Fusion Network for Multi-view Sequential Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17341/16122), AAAI 2018 
[[code]](https://github.com/pliang279/MFN) + +[Tensor Fusion Network for Multimodal Sentiment Analysis](https://arxiv.org/abs/1707.07250), EMNLP 2017 [[code]](https://github.com/A2Zadeh/TensorFusionNetwork) + +[Jointly Modeling Deep Video and Compositional Text to Bridge Vision and Language in a Unified Framework](http://web.eecs.umich.edu/~jjcorso/pubs/xu_corso_AAAI2015_v2t.pdf), AAAI 2015 + +[A co-regularized approach to semi-supervised learning with multiple views](https://web.cse.ohio-state.edu/~belkin.8/papers/CASSL_ICML_05.pdf), ICML 2005 + +### Multimodal Alignment + +[Reconsidering Representation Alignment for Multi-view Clustering](https://openaccess.thecvf.com/content/CVPR2021/html/Trosten_Reconsidering_Representation_Alignment_for_Multi-View_Clustering_CVPR_2021_paper.html), CVPR 2021 [[code]](https://github.com/DanielTrosten/mvc) + +[CoMIR: Contrastive Multimodal Image Representation for Registration](https://arxiv.org/pdf/2006.06325.pdf), NeurIPS 2020 [[code]](https://github.com/MIDA-group/CoMIR) + +[Multimodal Transformer for Unaligned Multimodal Language Sequences](https://arxiv.org/abs/1906.00295), ACL 2019 [[code]](https://github.com/yaohungt/Multimodal-Transformer) + +[Temporal Cycle-Consistency Learning](https://arxiv.org/abs/1904.07846), CVPR 2019 [[code]](https://github.com/google-research/google-research/tree/master/tcc) + +[See, Hear, and Read: Deep Aligned Representations](https://people.csail.mit.edu/yusuf/see-hear-read/paper.pdf), arXiv 2017 + +[On Deep Multi-View Representation Learning](http://proceedings.mlr.press/v37/wangb15.pdf), ICML 2015 + +[Unsupervised Alignment of Natural Language Instructions with Video Segments](https://dl.acm.org/citation.cfm?id=2892753.2892769), AAAI 2014 + +[Multimodal Alignment of Videos](https://dl.acm.org/citation.cfm?id=2654862), MM 2014 + +[Deep Canonical Correlation Analysis](http://proceedings.mlr.press/v28/andrew13.html), ICML 2013 [[code]](https://github.com/VahidooX/DeepCCA) + +### Multimodal 
Pretraining +[Align before Fuse: Vision and Language Representation Learning with Momentum Distillation](https://arxiv.org/abs/2107.07651), NeurIPS 2021 Spotlight [[code]](https://github.com/salesforce/ALBEF) + +[Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling](https://arxiv.org/abs/2102.06183), CVPR 2021 [[code]](https://github.com/jayleicn/ClipBERT) + +[Transformer is All You Need: Multimodal Multitask Learning with a Unified Transformer](https://arxiv.org/abs/2102.10772), arXiv 2021 + +[Large-Scale Adversarial Training for Vision-and-Language Representation Learning](https://arxiv.org/abs/2006.06195), NeurIPS 2020 [[code]](https://github.com/zhegan27/VILLA) + +[Vokenization: Improving Language Understanding with Contextualized, Visual-Grounded Supervision](https://arxiv.org/abs/2010.06775), EMNLP 2020 [[code]](https://github.com/airsplay/vokenization) + +[Integrating Multimodal Information in Large Pretrained Transformers](https://arxiv.org/abs/1908.05787), ACL 2020 + +[VL-BERT: Pre-training of Generic Visual-Linguistic Representations](https://arxiv.org/abs/1908.08530), arXiv 2019 [[code]](https://github.com/jackroos/VL-BERT) + +[VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/abs/1908.03557), arXiv 2019 [[code]](https://github.com/uclanlp/visualbert) + +[ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks](https://arxiv.org/abs/1908.02265), NeurIPS 2019 [[code]](https://github.com/jiasenlu/vilbert_beta) + +[Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training](https://arxiv.org/abs/1908.06066), arXiv 2019 + +[LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490), EMNLP 2019 [[code]](https://github.com/airsplay/lxmert) + +[VideoBERT: A Joint Model for Video and Language Representation Learning](https://arxiv.org/abs/1904.01766), ICCV 2019 + +### Multimodal 
Translation + +[Zero-Shot Text-to-Image Generation](https://arxiv.org/abs/2102.12092), ICML 2021 [[code]](https://github.com/openai/DALL-E) + +[Translate-to-Recognize Networks for RGB-D Scene Recognition](https://openaccess.thecvf.com/content_CVPR_2019/papers/Du_Translate-to-Recognize_Networks_for_RGB-D_Scene_Recognition_CVPR_2019_paper.pdf), CVPR 2019 [[code]](https://github.com/ownstyledu/Translate-to-Recognize-Networks) + +[Language2Pose: Natural Language Grounded Pose Forecasting](https://arxiv.org/abs/1907.01108), 3DV 2019 [[code]](http://chahuja.com/language2pose/) + +[Reconstructing Faces from Voices](https://arxiv.org/abs/1905.10604), NeurIPS 2019 [[code]](https://github.com/cmu-mlsp/reconstructing_faces_from_voices) + +[Speech2Face: Learning the Face Behind a Voice](https://arxiv.org/abs/1905.09773), CVPR 2019 [[code]](https://speech2face.github.io/) + +[Found in Translation: Learning Robust Joint Representations by Cyclic Translations Between Modalities](https://arxiv.org/abs/1812.07809), AAAI 2019 [[code]](https://github.com/hainow/MCTN) + +[Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884), ICASSP 2018 [[code]](https://github.com/NVIDIA/tacotron2) + +### Crossmodal Retrieval + +[Learning with Noisy Correspondence for Cross-modal Matching](https://proceedings.neurips.cc/paper/2021/file/f5e62af885293cf4d511ceef31e61c80-Paper.pdf), NeurIPS 2021 [[code]](https://github.com/XLearning-SCU/2021-NeurIPS-NCR) + +[MURAL: Multimodal, Multitask Retrieval Across Languages](https://arxiv.org/abs/2109.05125), arXiv 2021 + +[Self-Supervised Learning from Web Data for Multimodal Retrieval](https://arxiv.org/abs/1901.02004), arXiv 2019 + +[Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models](https://arxiv.org/abs/1711.06420), CVPR 2018 + +[Scene-centric vs. 
Object-centric Image-Text Cross-modal Retrieval: A Reproducibility Study](https://arxiv.org/abs/2301.05174), ECIR 2023 + +### Multimodal Co-learning + +[Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918), ICML 2021 + +[Multimodal Co-learning: Challenges, Applications with Datasets, Recent Advances and Future Directions](https://arxiv.org/abs/2107.13782), arXiv 2021 + +[Vokenization: Improving Language Understanding via Contextualized, Visually-Grounded Supervision](https://arxiv.org/abs/2010.06775), EMNLP 2020 + +[Foundations of Multimodal Co-learning](https://www.sciencedirect.com/science/article/pii/S1566253520303006), Information Fusion 2020 + +### Missing or Imperfect Modalities + +[A Variational Information Bottleneck Approach to Multi-Omics Data Integration](https://arxiv.org/abs/2102.03014), AISTATS 2021 [[code]](https://github.com/chl8856/DeepIMV) + +[SMIL: Multimodal Learning with Severely Missing Modality](https://arxiv.org/abs/2103.05677), AAAI 2021 + +[Factorized Inference in Deep Markov Models for Incomplete Multimodal Time Series](https://arxiv.org/abs/1905.13570), arXiv 2019 + +[Learning Representations from Imperfect Time Series Data via Tensor Rank Regularization](https://arxiv.org/abs/1907.01011), ACL 2019 + +[Multimodal Deep Learning for Robust RGB-D Object Recognition](https://arxiv.org/abs/1507.06821), IROS 2015 + +### Analysis of Multimodal Models + +[M2Lens: Visualizing and Explaining Multimodal Models for Sentiment Analysis](https://arxiv.org/abs/2107.08264), IEEE TVCG 2022 + +[Decoupling the Role of Data, Attention, and Losses in Multimodal Transformers](https://arxiv.org/abs/2102.00529), TACL 2021 + +[Does my multimodal model learn cross-modal interactions? 
It’s harder to tell than you might think!](https://www.aclweb.org/anthology/2020.emnlp-main.62.pdf), EMNLP 2020 + +[Blindfold Baselines for Embodied QA](https://arxiv.org/abs/1811.05013), NIPS 2018 Visually-Grounded Interaction and Language Workshop + +[Analyzing the Behavior of Visual Question Answering Models](https://arxiv.org/abs/1606.07356), EMNLP 2016 + +### Knowledge Graphs and Knowledge Bases + +[MMKG: Multi-Modal Knowledge Graphs](https://arxiv.org/abs/1903.05485), ESWC 2019 + +[Answering Visual-Relational Queries in Web-Extracted Knowledge Graphs](https://arxiv.org/abs/1709.02314), AKBC 2019 + +[Embedding Multimodal Relational Data for Knowledge Base Completion](https://arxiv.org/abs/1809.01341), EMNLP 2018 + +[A Multimodal Translation-Based Approach for Knowledge Graph Representation Learning](https://www.aclweb.org/anthology/S18-2027), SEM 2018 [[code]](https://github.com/UKPLab/starsem18-multimodalKB) + +[Order-Embeddings of Images and Language](https://arxiv.org/abs/1511.06361), ICLR 2016 [[code]](https://github.com/ivendrov/order-embedding) + +[Building a Large-scale Multimodal Knowledge Base System for Answering Visual Queries](https://arxiv.org/abs/1507.05670), arXiv 2015 + +### Intepretable Learning + +[Multimodal Explanations by Predicting Counterfactuality in Videos](https://arxiv.org/abs/1812.01263), CVPR 2019 + +[Multimodal Explanations: Justifying Decisions and Pointing to the Evidence](https://arxiv.org/abs/1802.08129), CVPR 2018 [[code]](https://github.com/Seth-Park/MultimodalExplanations) + +[Do Explanations make VQA Models more Predictable to a Human?](https://arxiv.org/abs/1810.12366), EMNLP 2018 + +[Towards Transparent AI Systems: Interpreting Visual Question Answering Models](https://arxiv.org/abs/1608.08974), ICML Workshop on Visualization for Deep Learning 2016 + +### Generative Learning + +[MMVAE+: Enhancing the Generative Quality of Multimodal VAEs without Compromises](https://openreview.net/forum?id=sdQGxouELX), ICLR 2023 
[[code]](https://github.com/epalu/mmvaeplus) + +[On the Limitations of Multimodal VAEs](https://arxiv.org/abs/2110.04121), ICLR 2022 [[code]](https://openreview.net/attachment?id=w-CPUXXrAj&name=supplementary_material) + +[Generalized Multimodal ELBO](https://openreview.net/forum?id=5Y21V0RDBV), ICLR 2021 [[code]](https://github.com/thomassutter/MoPoE) + +[Multimodal Generative Learning Utilizing Jensen-Shannon-Divergence](https://arxiv.org/abs/2006.08242), NeurIPS 2020 [[code]](https://github.com/thomassutter/mmjsd) + +[Self-supervised Disentanglement of Modality-specific and Shared Factors Improves Multimodal Generative Models](https://rdcu.be/c8WUU), GCPR 2020 [[code]](https://github.com/imantdaunhawer/DMVAE) + +[Variational Mixture-of-Experts Autoencoders for Multi-Modal Deep Generative Models](https://arxiv.org/pdf/1911.03393.pdf), NeurIPS 2019 [[code]](https://github.com/iffsid/mmvae) + +[Few-shot Video-to-Video Synthesis](https://arxiv.org/abs/1910.12713), NeurIPS 2019 [[code]](https://nvlabs.github.io/few-shot-vid2vid/) + +[Multimodal Generative Models for Scalable Weakly-Supervised Learning](https://arxiv.org/abs/1802.05335), NeurIPS 2018 [[code1]](https://github.com/mhw32/multimodal-vae-public) [[code2]](https://github.com/panpan2/Multimodal-Variational-Autoencoder) + +[The Multi-Entity Variational Autoencoder](http://charlienash.github.io/assets/docs/mevae2017.pdf), NeurIPS 2017 + +### Semi-supervised Learning + +[Semi-supervised Vision-language Mapping via Variational Learning](https://ieeexplore.ieee.org/document/7989160), ICRA 2017 + +[Semi-supervised Multimodal Hashing](https://arxiv.org/abs/1712.03404), arXiv 2017 + +[Semi-Supervised Multimodal Deep Learning for RGB-D Object Recognition](https://www.ijcai.org/Proceedings/16/Papers/473.pdf), IJCAI 2016 + +[Multimodal Semi-supervised Learning for Image Classification](https://ieeexplore.ieee.org/abstract/document/5540120), CVPR 2010 + +### Self-supervised Learning + +[DABS: A Domain-Agnostic Benchmark 
for Self-Supervised Learning](https://arxiv.org/abs/2111.12062), NeurIPS 2021 Datasets & Benchmarks Track [[code]](https://github.com/alextamkin/dabs) + +[Self-Supervised Learning by Cross-Modal Audio-Video Clustering](https://arxiv.org/abs/1911.12667), NeurIPS 2020 [[code]](https://github.com/HumamAlwassel/XDC) + +[Self-Supervised MultiModal Versatile Networks](https://arxiv.org/abs/2006.16228), NeurIPS 2020 [[code]](https://tfhub.dev/deepmind/mmv/s3d/1) + +[Labelling Unlabelled Videos from Scratch with Multi-modal Self-supervision](https://arxiv.org/abs/2006.13662), NeurIPS 2020 [[code]](https://www.robots.ox.ac.uk/~vgg/research/selavi/) + +[Self-Supervised Learning of Visual Features through Embedding Images into Text Topic Spaces](https://ieeexplore.ieee.org/document/8099701), CVPR 2017 + +[Multimodal Dynamics : Self-supervised Learning in Perceptual and Motor Systems](https://dl.acm.org/citation.cfm?id=1269207), 2016 + +### Language Models + +[Neural Language Modeling with Visual Features](https://arxiv.org/abs/1903.02930), arXiv 2019 + +[Learning Multi-Modal Word Representation Grounded in Visual Context](https://arxiv.org/abs/1711.03483), AAAI 2018 + +[Visual Word2Vec (vis-w2v): Learning Visually Grounded Word Embeddings Using Abstract Scenes](https://arxiv.org/abs/1511.07067), CVPR 2016 + +[Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models](http://proceedings.mlr.press/v32/kiros14.html), ICML 2014 [[code]](https://github.com/ryankiros/visual-semantic-embedding) + +### Adversarial Attacks + +[Attend and Attack: Attention Guided Adversarial Attacks on Visual Question Answering Models](https://nips2018vigil.github.io/static/papers/accepted/33.pdf), NeurIPS Workshop on Visually Grounded Interaction and Language 2018 + +[Attacking Visual Language Grounding with Adversarial Examples: A Case Study on Neural Image Captioning](https://arxiv.org/abs/1712.02051), ACL 2018 [[code]](https://github.com/huanzhang12/ImageCaptioningAttack) + 
+[Fooling Vision and Language Models Despite Localization and Attention Mechanism](https://arxiv.org/abs/1709.08693), CVPR 2018 + +### Few-Shot Learning + +[Language to Network: Conditional Parameter Adaptation with Natural Language Descriptions](https://www.aclweb.org/anthology/2020.acl-main.625/), ACL 2020 + +[Shaping Visual Representations with Language for Few-shot Classification](https://arxiv.org/abs/1911.02683), ACL 2020 + +[Zero-Shot Learning - The Good, the Bad and the Ugly](https://arxiv.org/abs/1703.04394), CVPR 2017 + +[Zero-Shot Learning Through Cross-Modal Transfer](https://nlp.stanford.edu/~socherr/SocherGanjooManningNg_NIPS2013.pdf), NIPS 2013 + +### Bias and Fairness + +[Worst of Both Worlds: Biases Compound in Pre-trained Vision-and-Language Models](https://arxiv.org/abs/2104.08666), arXiv 2021 + +[Towards Debiasing Sentence Representations](https://arxiv.org/abs/2007.08100), ACL 2020 [[code]](https://github.com/pliang279/sent_debias) + +[FairCVtest Demo: Understanding Bias in Multimodal Learning with a Testbed in Fair Automatic Recruitment](https://arxiv.org/abs/2009.07025), ICMI 2020 [[code]](https://github.com/BiDAlab/FairCVtest) + +[Model Cards for Model Reporting](https://arxiv.org/abs/1810.03993), FAccT 2019 + +[Black is to Criminal as Caucasian is to Police: Detecting and Removing Multiclass Bias in Word Embeddings](https://arxiv.org/abs/1904.04047), NAACL 2019 [[code]](https://github.com/TManzini/DebiasMulticlassWordEmbedding) + +[Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification](http://proceedings.mlr.press/v81/buolamwini18a.html?mod=article_inline), FAccT 2018 + +[Datasheets for Datasets](https://arxiv.org/abs/1803.09010), arXiv 2018 + +[Man is to Computer Programmer as Woman is to Homemaker? 
Debiasing Word Embeddings](https://arxiv.org/abs/1607.06520), NeurIPS 2016 + +### Human in the Loop Learning + +[Human in the Loop Dialogue Systems](https://sites.google.com/view/hlds-2020/home), NeurIPS 2020 workshop + +[Human And Machine in-the-Loop Evaluation and Learning Strategies](https://hamlets-workshop.github.io/), NeurIPS 2020 workshop + +[Human-centric dialog training via offline reinforcement learning](https://arxiv.org/abs/2010.05848), EMNLP 2020 [[code]](https://github.com/natashamjaques/neural_chat/tree/master/BatchRL) + +[Human-In-The-Loop Machine Learning with Intelligent Multimodal Interfaces](https://csjzhou.github.io/homepage/papers/ICML2017_Syed.pdf), ICML 2017 workshop + +## Architectures + +### Multimodal Transformers + +[Pretrained Transformers As Universal Computation Engines](https://arxiv.org/abs/2103.05247), AAAI 2022 + +[Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206), ICML 2021 + +[FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482), arXiv 2021 + +[PolyViT: Co-training Vision Transformers on Images, Videos and Audio](https://arxiv.org/abs/2111.12993), arXiv 2021 + +[VATT: Transformers for Multimodal Self-Supervised Learning from Raw Video, Audio and Text](https://arxiv.org/abs/2104.11178), NeurIPS 2021 [[code]](https://github.com/google-research/google-research/tree/master/vatt) + +[Parameter Efficient Multimodal Transformers for Video Representation Learning](https://arxiv.org/abs/2012.04124), ICLR 2021 [[code]](https://github.com/sangho-vision/avbert) + +### Multimodal Memory + +[Multimodal Transformer with Variable-length Memory for Vision-and-Language Navigation](https://arxiv.org/abs/2111.05759), arXiv 2021 + +[History Aware Multimodal Transformer for Vision-and-Language Navigation](https://arxiv.org/abs/2110.13309), NeurIPS 2021 [[code]](https://cshizhe.github.io/projects/vln_hamt.html) + +[Episodic Memory in Lifelong Language 
Learning](https://arxiv.org/abs/1906.01076), NeurIPS 2019 + +[ICON: Interactive Conversational Memory Network for Multimodal Emotion Detection](https://aclanthology.org/D18-1280.pdf), EMNLP 2018 + +[Multimodal Memory Modelling for Video Captioning](https://arxiv.org/abs/1611.05592), CVPR 2018 + +[Dynamic Memory Networks for Visual and Textual Question Answering](https://arxiv.org/abs/1603.01417), ICML 2016 + +## Applications and Datasets + +### Language and Visual QA + +[TAG: Boosting Text-VQA via Text-aware Visual Question-answer Generation](https://arxiv.org/abs/2208.01813), arXiv 2022 [[code]](https://github.com/HenryJunW/TAG) + +[Learning to Answer Questions in Dynamic Audio-Visual Scenarios](https://arxiv.org/abs/2203.14072), CVPR 2022 + +[SUTD-TrafficQA: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events](https://openaccess.thecvf.com/content/CVPR2021/html/Xu_SUTD-TrafficQA_A_Question_Answering_Benchmark_and_an_Efficient_Network_for_CVPR_2021_paper.html), CVPR 2021 [[code]](https://github.com/SUTDCV/SUTD-TrafficQA) + +[MultiModalQA: complex question answering over text, tables and images](https://openreview.net/forum?id=ee6W5UgQLa), ICLR 2021 + +[ManyModalQA: Modality Disambiguation and QA over Diverse Inputs](https://arxiv.org/abs/2001.08034), AAAI 2020 [[code]](https://github.com/hannandarryl/ManyModalQA) + +[Iterative Answer Prediction with Pointer-Augmented Multimodal Transformers for TextVQA](https://arxiv.org/abs/1911.06258), CVPR 2020 + +[Interactive Language Learning by Question Answering](https://arxiv.org/abs/1908.10909), EMNLP 2019 [[code]](https://github.com/xingdi-eric-yuan/qait_public) + +[Fusion of Detected Objects in Text for Visual Question Answering](https://arxiv.org/abs/1908.05054), arXiv 2019 + +[RUBi: Reducing Unimodal Biases in Visual Question Answering](https://arxiv.org/abs/1906.10169), NeurIPS 2019 [[code]](https://github.com/cdancette/rubi.bootstrap.pytorch) + +[GQA: A New Dataset for 
Real-World Visual Reasoning and Compositional Question Answering](https://arxiv.org/abs/1902.09506), CVPR 2019 [[code]](https://cs.stanford.edu/people/dorarad/gqa/) + +[OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge](https://arxiv.org/abs/1906.00067), CVPR 2019 [[code]](http://okvqa.allenai.org/) + +[MUREL: Multimodal Relational Reasoning for Visual Question Answering](https://arxiv.org/abs/1902.09487), CVPR 2019 [[code]](https://github.com/Cadene/murel.bootstrap.pytorch) + +[Social-IQ: A Question Answering Benchmark for Artificial Social Intelligence](http://openaccess.thecvf.com/content_CVPR_2019/html/Zadeh_Social-IQ_A_Question_Answering_Benchmark_for_Artificial_Social_Intelligence_CVPR_2019_paper.html), CVPR 2019 [[code]](https://github.com/A2Zadeh/Social-IQ) + +[Probabilistic Neural-symbolic Models for Interpretable Visual Question Answering](https://arxiv.org/abs/1902.07864), ICML 2019 [[code]](https://github.com/kdexd/probnmn-clevr) + +[Learning to Count Objects in Natural Images for Visual Question Answering](https://arxiv.org/abs/1802.05766), ICLR 2018, [[code]](https://github.com/Cyanogenoid/vqa-counting) + +[Overcoming Language Priors in Visual Question Answering with Adversarial Regularization](https://arxiv.org/abs/1810.03649), NeurIPS 2018 + +[Neural-Symbolic VQA: Disentangling Reasoning from Vision and Language Understanding](https://arxiv.org/abs/1810.02338), NeurIPS 2018 [[code]](https://github.com/kexinyi/ns-vqa) + +[RecipeQA: A Challenge Dataset for Multimodal Comprehension of Cooking Recipes](https://arxiv.org/abs/1809.00812), EMNLP 2018 [[code]](https://hucvl.github.io/recipeqa/) + +[TVQA: Localized, Compositional Video Question Answering](https://www.aclweb.org/anthology/D18-1167), EMNLP 2018 [[code]](https://github.com/jayleicn/TVQA) + +[Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering](https://arxiv.org/abs/1707.07998), CVPR 2018 
[[code]](https://github.com/facebookresearch/pythia) + +[Don't Just Assume; Look and Answer: Overcoming Priors for Visual Question Answering](https://arxiv.org/abs/1712.00377), CVPR 2018 [[code]](https://github.com/AishwaryaAgrawal/GVQA) + +[Stacked Latent Attention for Multimodal Reasoning](http://openaccess.thecvf.com/content_cvpr_2018/papers/Fan_Stacked_Latent_Attention_CVPR_2018_paper.pdf), CVPR 2018 + +[Learning to Reason: End-to-End Module Networks for Visual Question Answering](https://arxiv.org/abs/1704.05526), ICCV 2017 [[code]](https://github.com/ronghanghu/n2nmn) + +[CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning](https://arxiv.org/abs/1612.06890), CVPR 2017 [[code]](https://github.com/facebookresearch/clevr-iep) [[dataset generation]](https://github.com/facebookresearch/clevr-dataset-gen) + +[Are You Smarter Than A Sixth Grader? Textbook Question Answering for Multimodal Machine Comprehension](https://ieeexplore.ieee.org/document/8100054/), CVPR 2017 [[code]](http://vuchallenge.org/tqa.html) + +[Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding](https://arxiv.org/abs/1606.01847), EMNLP 2016 [[code]](https://github.com/akirafukui/vqa-mcb) + +[MovieQA: Understanding Stories in Movies through Question-Answering](https://arxiv.org/abs/1512.02902), CVPR 2016 [[code]](http://movieqa.cs.toronto.edu/home/) + +[VQA: Visual Question Answering](https://arxiv.org/abs/1505.00468), ICCV 2015 [[code]](https://visualqa.org/) + +### Language Grounding in Vision + +[Core Challenges in Embodied Vision-Language Planning](https://arxiv.org/abs/2106.13948), arXiv 2021 + +[MaRVL: Multicultural Reasoning over Vision and Language](https://arxiv.org/pdf/2109.13238), EMNLP 2021 [[code]](https://marvl-challenge.github.io/) + +[Grounding 'Grounding' in NLP](https://arxiv.org/abs/2106.02192), ACL 2021 + +[The Hateful Memes Challenge: Detecting Hate Speech in Multimodal 
Memes](https://arxiv.org/abs/2005.04790), NeurIPS 2020 [[code]](https://ai.facebook.com/blog/hateful-memes-challenge-and-data-set/) + +[What Does BERT with Vision Look At?](https://www.aclweb.org/anthology/2020.acl-main.469/), ACL 2020 + +[Visual Grounding in Video for Unsupervised Word Translation](https://arxiv.org/abs/2003.05078), CVPR 2020 [[code]](https://github.com/gsig/visual-grounding) + +[VIOLIN: A Large-Scale Dataset for Video-and-Language Inference](https://arxiv.org/abs/2003.11618), CVPR 2020 [[code]](https://github.com/jimmy646/violin) + +[Grounded Video Description](https://arxiv.org/abs/1812.06587), CVPR 2019 + +[Show, Control and Tell: A Framework for Generating Controllable and Grounded Captions](https://arxiv.org/abs/1811.10652), CVPR 2019 + +[Multilevel Language and Vision Integration for Text-to-Clip Retrieval](https://arxiv.org/abs/1804.05113), AAAI 2019 [[code]](https://github.com/VisionLearningGroup/Text-to-Clip_Retrieval) + +[Binary Image Selection (BISON): Interpretable Evaluation of Visual Grounding](https://arxiv.org/abs/1901.06595), arXiv 2019 [[code]](https://github.com/facebookresearch/binary-image-selection) + +[Finding “It”: Weakly-Supervised Reference-Aware Visual Grounding in Instructional Videos](http://openaccess.thecvf.com/content_cvpr_2018/papers/Huang_Finding_It_Weakly-Supervised_CVPR_2018_paper.pdf), CVPR 2018 + +[SCAN: Learning Hierarchical Compositional Visual Concepts](https://arxiv.org/abs/1707.03389), ICLR 2018 + +[Visual Coreference Resolution in Visual Dialog using Neural Module Networks](https://arxiv.org/abs/1809.01816), ECCV 2018 [[code]](https://github.com/facebookresearch/corefnmn) + +[Gated-Attention Architectures for Task-Oriented Language Grounding](https://arxiv.org/abs/1706.07230), AAAI 2018 [[code]](https://github.com/devendrachaplot/DeepRL-Grounding) + +[Using Syntax to Ground Referring Expressions in Natural Images](https://arxiv.org/abs/1805.10547), AAAI 2018 
[[code]](https://github.com/volkancirik/groundnet) + +[Grounding language acquisition by training semantic parsers using captioned videos](https://cbmm.mit.edu/sites/default/files/publications/Ross-et-al_ACL2018_Grounding%20language%20acquisition%20by%20training%20semantic%20parsing%20using%20caption%20videos.pdf), ACL 2018 + +[Interpretable and Globally Optimal Prediction for Textual Grounding using Image Concepts](https://arxiv.org/abs/1803.11209), NeurIPS 2017 + +[Localizing Moments in Video with Natural Language](https://arxiv.org/abs/1708.01641), ICCV 2017 + +[What are you talking about? Text-to-Image Coreference](https://ieeexplore.ieee.org/abstract/document/6909850/), CVPR 2014 + +[Grounded Language Learning from Video Described with Sentences](https://www.aclweb.org/anthology/P13-1006), ACL 2013 + +[Grounded Compositional Semantics for Finding and Describing Images with Sentences](https://nlp.stanford.edu/~socherr/SocherKarpathyLeManningNg_TACL2013.pdf), TACL 2013 + +### Language Grouding in Navigation + +[ALFWorld: Aligning Text and Embodied Environments for Interactive Learning](https://arxiv.org/abs/2010.03768), ICLR 2021 [[code]](http://alfworld.github.io/) + +[Hierarchical Cross-Modal Agent for Robotics Vision-and-Language Navigation](https://arxiv.org/abs/2104.10674), ICRA 2021, [[code]](https://github.com/GT-RIPL/robo-vln), [[video]](https://www.youtube.com/watch?v=y16x9n_zP_4), [[project page]](https://zubair-irshad.github.io/projects/robo-vln.html) + +[Improving Vision-and-Language Navigation with Image-Text Pairs from the Web](https://arxiv.org/abs/2004.14973), ECCV 2020 + +[Towards Learning a Generic Agent for Vision-and-Language Navigation via Pre-training](https://arxiv.org/abs/2002.10638), CVPR 2020 [[code]](https://github.com/weituo12321/PREVALENT) + +[VideoNavQA: Bridging the Gap between Visual and Embodied Question Answering](https://arxiv.org/abs/1908.04950), BMVC 2019 [[code]](https://github.com/catalina17/VideoNavQA) + 
+[Vision-and-Dialog Navigation](https://arxiv.org/abs/1907.04957), arXiv 2019 [[code]](https://github.com/mmurray/cvdn) + +[Hierarchical Decision Making by Generating and Following Natural Language Instructions](https://arxiv.org/abs/1906.00744), arXiv 2019 [[code]](https://www.minirts.net/) + +[Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation](https://arxiv.org/abs/1905.12255), ACL 2019 + +[Are You Looking? Grounding to Multiple Modalities in Vision-and-Language Navigation](https://arxiv.org/abs/1906.00347), ACL 2019 + +[Touchdown: Natural Language Navigation and Spatial Reasoning in Visual Street Environments](https://arxiv.org/abs/1811.12354), CVPR 2019 [[code]](https://github.com/lil-lab/touchdown) + +[Reinforced Cross-Modal Matching and Self-Supervised Imitation Learning for Vision-Language Navigation](https://arxiv.org/abs/1811.10092), CVPR 2019 + +[The Regretful Navigation Agent for Vision-and-Language Navigation](https://arxiv.org/abs/1903.01602), CVPR 2019 [[code]](https://github.com/chihyaoma/regretful-agent) + +[Tactical Rewind: Self-Correction via Backtracking in Vision-and-Language Navigation](https://arxiv.org/abs/1903.02547), CVPR 2019 [[code]](https://github.com/Kelym/FAST) + +[Multi-modal Discriminative Model for Vision-and-Language Navigation](https://www.aclweb.org/anthology/W19-1605), NAACL SpLU-RoboNLP Workshop 2019 + +[Self-Monitoring Navigation Agent via Auxiliary Progress Estimation](https://arxiv.org/abs/1901.03035), ICLR 2019 [[code]](https://github.com/chihyaoma/selfmonitoring-agent) + +[From Language to Goals: Inverse Reinforcement Learning for Vision-Based Instruction Following](https://arxiv.org/abs/1902.07742), ICLR 2019 + +[Read, Watch, and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos](https://arxiv.org/abs/1901.06829), AAAI 2019 + +[Learning to Navigate Unseen Environments: Back Translation with Environmental Dropout](https://www.aclweb.org/anthology/N19-1268), 
NAACL 2019 [[code]](https://github.com/airsplay/R2R-EnvDrop) + +[Attention Based Natural Language Grounding by Navigating Virtual Environment](https://arxiv.org/abs/1804.08454), IEEE WACV 2019 + +[Mapping Instructions to Actions in 3D Environments with Visual Goal Prediction](https://arxiv.org/abs/1809.00786), EMNLP 2018 [[code]](https://github.com/lil-lab/ciff) + +[Vision-and-Language Navigation: Interpreting Visually-Grounded Navigation Instructions in Real Environments](https://arxiv.org/abs/1711.07280), CVPR 2018 [[code]](https://bringmeaspoon.org/) + +[Embodied Question Answering](https://arxiv.org/abs/1711.11543), CVPR 2018 [[code]](https://embodiedqa.org/) + +[Look Before You Leap: Bridging Model-Free and Model-Based Reinforcement Learning for Planned-Ahead Vision-and-Language Navigation](https://arxiv.org/abs/1803.07729), ECCV 2018 + +### Multimodal Machine Translation + +[Unsupervised Multimodal Neural Machine Translation with Pseudo Visual Pivoting](https://arxiv.org/abs/2005.03119), ACL 2020 + +[Multimodal Transformer for Multimodal Machine Translation](https://www.aclweb.org/anthology/2020.acl-main.400/), ACL 2020 + +[Neural Machine Translation with Universal Visual Representation](https://openreview.net/forum?id=Byl8hhNYPS), ICLR 2020 [[code]](https://github.com/cooelf/UVR-NMT) + +[Visual Agreement Regularized Training for Multi-Modal Machine Translation](https://arxiv.org/abs/1912.12014), AAAI 2020 + +[VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research](https://arxiv.org/abs/1904.03493), ICCV 2019 [[code]](http://vatex.org/main/index.html) + +[Latent Variable Model for Multi-modal Translation](https://arxiv.org/pdf/1811.00357), ACL 2019 + +[Distilling Translations with Visual Awareness](https://arxiv.org/pdf/1906.07701), ACL 2019 + +[Probing the Need for Visual Context in Multimodal Machine Translation](https://www.aclweb.org/anthology/N19-1422), NAACL 2019 + +[Emergent Translation in Multi-Agent 
Communication](https://openreview.net/pdf?id=H1vEXaxA-), ICLR 2018 + +[Zero-Resource Neural Machine Translation with Multi-Agent Communication Game](https://arxiv.org/pdf/1802.03116), AAAI 2018 + +[Learning Translations via Images with a Massively Multilingual Image Dataset](http://aclweb.org/anthology/P18-1239), ACL 2018 + +[A Visual Attention Grounding Neural Model for Multimodal Machine Translation](http://aclweb.org/anthology/D18-1400), EMNLP 2018 + +[Adversarial Evaluation of Multimodal Machine Translation](http://aclweb.org/anthology/D18-1329), EMNLP 2018 + +[Doubly-Attentive Decoder for Multi-modal Neural Machine Translation](http://aclweb.org/anthology/P17-1175), ACL 2017 [[code]](https://github.com/iacercalixto/MultimodalNMT) + +[An empirical study on the effectiveness of images in Multimodal Neural Machine Translation](http://aclweb.org/anthology/D17-1095), EMNLP 2017 + +[Incorporating Global Visual Features into Attention-based Neural Machine Translation](http://aclweb.org/anthology/D17-1105), EMNLP 2017 [[code]](https://github.com/iacercalixto/MultimodalNMT) + +[Multimodal Pivots for Image Caption Translation](http://aclweb.org/anthology/P16-1227), ACL 2016 + +[Multi30K: Multilingual English-German Image Descriptions](https://aclweb.org/anthology/W16-3210.pdf), ACL Workshop on Language and Vision 2016 [[code]](https://github.com/multi30k/dataset) + +[Does Multimodality Help Human and Machine for Translation and Image Captioning?](http://www.statmt.org/wmt16/pdf/W16-2358.pdf), ACL WMT 2016 + +### Multi-agent Communication + +[Multi-agent Communication meets Natural Language: Synergies between Functional and Structural Language Learning](https://arxiv.org/abs/2005.07064), ACL 2020 + +[Emergence of Compositional Language with Deep Generational Transmission](https://arxiv.org/abs/1904.09067), ICML 2019 + +[On the Pitfalls of Measuring Emergent Communication](https://arxiv.org/abs/1903.05168), AAMAS 2019 
[[code]](https://github.com/facebookresearch/measuring-emergent-comm) + +[Emergent Translation in Multi-Agent Communication](https://arxiv.org/abs/1710.06922), ICLR 2018 [[code]](https://github.com/facebookresearch/translagent) + +[Emergent Communication in a Multi-Modal, Multi-Step Referential Game](https://openreview.net/pdf?id=rJGZq6g0-), ICLR 2018 [[code]](https://github.com/nyu-dl/MultimodalGame) + +[Emergence of Linguistic Communication From Referential Games with Symbolic and Pixel Input](https://openreview.net/pdf?id=HJGv1Z-AW), ICLR 2018 + +[Emergent Communication through Negotiation](https://openreview.net/pdf?id=Hk6WhagRW), ICLR 2018 [[code]](https://github.com/ASAPPinc/emergent_comms_negotiation) + +[Emergence of Grounded Compositional Language in Multi-Agent Populations](https://arxiv.org/abs/1703.04908), AAAI 2018 + +[Emergence of Language with Multi-agent Games: Learning to Communicate with Sequences of Symbols](https://arxiv.org/abs/1705.11192), NeurIPS 2017 + +[Natural Language Does Not Emerge 'Naturally' in Multi-Agent Dialog](https://arxiv.org/abs/1706.08502), EMNLP 2017 [[code1]](https://github.com/batra-mlp-lab/lang-emerge) [[code2]](https://github.com/kdexd/lang-emerge-parlai) + +[Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning](https://arxiv.org/abs/1703.06585), ICCV 2017 [[code]](https://github.com/batra-mlp-lab/visdial-rl) + +[Multi-agent Cooperation and the Emergence of (natural) Language](https://arxiv.org/abs/1612.07182), ICLR 2017 + +[Learning to Communicate with Deep Multi-agent Reinforcement Learning](https://arxiv.org/abs/1605.06676), NIPS 2016 + +[Learning multiagent communication with backpropagation](http://papers.nips.cc/paper/6398-learning-multiagent-communication-with-backpropagation.pdf), NIPS 2016 
+ +[The Emergence of Compositional Structures in Perceptually Grounded Language Games](https://www.cs.utexas.edu/~kuipers/readings/Vogt-aij-05.pdf), AI 2005 + +### Commonsense Reasoning + +[Adventures in Flatland: Perceiving Social Interactions Under Physical Dynamics](https://www.tshu.io/HeiderSimmel/CogSci20/Flatland_CogSci20.pdf), CogSci 2020 + +[A Logical Model for Supporting Social Commonsense Knowledge Acquisition](https://arxiv.org/abs/1912.11599), arXiv 2019 + +[Heterogeneous Graph Learning for Visual Commonsense Reasoning](https://arxiv.org/abs/1910.11475), NeurIPS 2019 + +[SocialIQA: Commonsense Reasoning about Social Interactions](https://arxiv.org/abs/1904.09728), arXiv 2019 + +[From Recognition to Cognition: Visual Commonsense Reasoning](https://arxiv.org/abs/1811.10830), CVPR 2019 [[code]](https://visualcommonsense.com/) + +[CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge](https://arxiv.org/abs/1811.00937), NAACL 2019 + +### Multimodal Reinforcement Learning + +[MiniHack the Planet: A Sandbox for Open-Ended Reinforcement Learning Research](https://arxiv.org/abs/2109.13202), NeurIPS 2021 [[code]](https://github.com/facebookresearch/minihack) + +[Imitating Interactive Intelligence](https://arxiv.org/abs/2012.05672), arXiv 2020 + +[Grounded Language Learning Fast and Slow](https://arxiv.org/abs/2009.01719), ICLR 2021 + +[RTFM: Generalising to Novel Environment Dynamics via Reading](https://arxiv.org/abs/1910.08210), ICLR 2020 [[code]](https://github.com/facebookresearch/RTFM) + +[Embodied Multimodal Multitask Learning](https://arxiv.org/abs/1902.01385), IJCAI 2020 + +[Learning to Speak and Act in a Fantasy Text Adventure Game](https://arxiv.org/abs/1903.03094), arXiv 2019 [[code]](https://parl.ai/projects/light/) + +[Language as an Abstraction for Hierarchical Deep Reinforcement Learning](https://arxiv.org/abs/1906.07343), NeurIPS 2019 + +[Hierarchical Decision Making by Generating and Following Natural Language 
Instructions](https://arxiv.org/abs/1906.00744), NeurIPS 2019 [[code]](https://github.com/facebookresearch/minirts) + +[Habitat: A Platform for Embodied AI Research](https://arxiv.org/abs/1904.01201), ICCV 2019 [[code]](https://aihabitat.org/) + +[Multimodal Hierarchical Reinforcement Learning Policy for Task-Oriented Visual Dialog](https://arxiv.org/abs/1805.03257), SIGDIAL 2018 + +[Mapping Instructions and Visual Observations to Actions with Reinforcement Learning](https://www.cs.cornell.edu/~dkm/papers/mla-emnlp.2017.pdf), EMNLP 2017 + +[Reinforcement Learning for Mapping Instructions to Actions](https://people.csail.mit.edu/regina/my_papers/RL.pdf), ACL 2009 + +### Multimodal Dialog + +[Two Causal Principles for Improving Visual Dialog](https://arxiv.org/abs/1911.10496), CVPR 2020 + +[MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations](https://arxiv.org/abs/1810.02508), ACL 2019 [[code]](http://affective-meld.github.io/) + +[CLEVR-Dialog: A Diagnostic Dataset for Multi-Round Reasoning in Visual Dialog](https://www.aclweb.org/anthology/N19-1058), NAACL 2019 [[code]](https://github.com/satwikkottur/clevr-dialog) + +[Talk the Walk: Navigating New York City through Grounded Dialogue](https://arxiv.org/abs/1807.03367), arXiv 2018 + +[Dialog-based Interactive Image Retrieval](https://arxiv.org/abs/1805.00145), NeurIPS 2018 [[code]](https://github.com/XiaoxiaoGuo/fashion-retrieval) + +[Towards Building Large Scale Multimodal Domain-Aware Conversation Systems](https://arxiv.org/abs/1704.00200), arXiv 2017 [[code]](https://amritasaha1812.github.io/MMD/) + +[Visual Dialog](https://arxiv.org/abs/1611.08669), CVPR 2017 [[code]](https://github.com/batra-mlp-lab/visdial) + +### Language and Audio + +[Lattice Transformer for Speech Translation](https://arxiv.org/abs/1906.05551), ACL 2019 + +[Exploring Phoneme-Level Speech Representations for End-to-End Speech Translation](https://arxiv.org/abs/1906.01199), ACL 2019 + +[Audio Caption: Listen and 
Tell](https://arxiv.org/abs/1902.09254), ICASSP 2019 + +[Audio-Linguistic Embeddings for Spoken Sentences](https://arxiv.org/abs/1902.07817), ICASSP 2019 + +[From Semi-supervised to Almost-unsupervised Speech Recognition with Very-low Resource by Jointly Learning Phonetic Structures from Audio and Text Embeddings](https://arxiv.org/abs/1904.05078), arXiv 2019 + +[From Audio to Semantics: Approaches To End-to-end Spoken Language Understanding](https://arxiv.org/abs/1809.09190), arXiv 2018 + +[Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884), ICASSP 2018 [[code]](https://github.com/NVIDIA/tacotron2) + +[Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654), ICLR 2018 + +[Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947), NeurIPS 2017 + +[Deep Voice: Real-time Neural Text-to-Speech](https://arxiv.org/abs/1702.07825), ICML 2017 + +[Text-to-Speech Synthesis](https://dl.acm.org/citation.cfm?id=1592988), 2009 + +### Audio and Visual + +[Music Gesture for Visual Sound Separation](https://arxiv.org/abs/2004.09476), CVPR 2020 + +[Co-Compressing and Unifying Deep CNN Models for Efficient Human Face and Speaker Recognition](http://openaccess.thecvf.com/content_CVPRW_2019/papers/MULA/Wan_Co-Compressing_and_Unifying_Deep_CNN_Models_for_Efficient_Human_Face_CVPRW_2019_paper.pdf), CVPRW 2019 + +[Learning Individual Styles of Conversational Gesture](https://arxiv.org/abs/1906.04160), CVPR 2019 [[code]](http://people.eecs.berkeley.edu/~shiry/speech2gesture) + +[Capture, Learning, and Synthesis of 3D Speaking Styles](https://ps.is.tuebingen.mpg.de/uploads_file/attachment/attachment/510/paper_final.pdf), CVPR 2019 [[code]](https://github.com/TimoBolkart/voca) + +[Disjoint Mapping Network for Cross-modal Matching of Voices and Faces](https://arxiv.org/abs/1807.04836), ICLR 2019 + +[Wav2Pix: Speech-conditioned Face Generation using 
Generative Adversarial Networks](https://arxiv.org/abs/1903.10195), ICASSP 2019 [[code]](https://imatge-upc.github.io/wav2pix/) + +[Learning Affective Correspondence between Music and Image](https://arxiv.org/abs/1904.00150), ICASSP 2019 [[dataset]](https://gaurav22verma.github.io/IMAC_Dataset.html) + +[Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input](https://arxiv.org/abs/1804.01452), ECCV 2018 [[code]](https://github.com/LiqunChen0606/Jointly-Discovering-Visual-Objects-and-Spoken-Words) + +[Seeing Voices and Hearing Faces: Cross-modal Biometric Matching](https://arxiv.org/abs/1804.00326), CVPR 2018 [[code]](https://github.com/a-nagrani/SVHF-Net) + +[Learning to Separate Object Sounds by Watching Unlabeled Video](http://openaccess.thecvf.com/content_cvpr_2018_workshops/papers/w49/Gao_Learning_to_Separate_CVPR_2018_paper.pdf), CVPR 2018 + +[Deep Audio-Visual Speech Recognition](https://arxiv.org/abs/1809.02108), IEEE TPAMI 2018 + +[Look, Listen and Learn](http://openaccess.thecvf.com/content_ICCV_2017/papers/Arandjelovic_Look_Listen_and_ICCV_2017_paper.pdf), ICCV 2017 + +[Unsupervised Learning of Spoken Language with Visual Context](https://papers.nips.cc/paper/6186-unsupervised-learning-of-spoken-language-with-visual-context.pdf), NeurIPS 2016 + +[SoundNet: Learning Sound Representations from Unlabeled Video](https://arxiv.org/abs/1610.09001), NeurIPS 2016 [[code]](http://projects.csail.mit.edu/soundnet/) + +### Visual, IMU and Wireless +[Vi-Fi: Associating Moving Subjects across Vision and Wireless Sensors](https://ieeexplore.ieee.org/document/9826015), IPSN 2022 [[code]](https://github.com/vifi2021/Vi-Fi) + +### Media Description + +[Towards Unsupervised Image Captioning with Shared Multimodal Embeddings](https://arxiv.org/abs/1908.09317), ICCV 2019 + +[Video Relationship Reasoning using Gated Spatio-Temporal Energy Graph](https://arxiv.org/abs/1903.10547), CVPR 2019 [[code]](https://github.com/yaohungt/GSTEG_CVPR_2019) + +[Joint Event 
Detection and Description in Continuous Video Streams](https://arxiv.org/abs/1802.10250), WACVW 2019 + +[Learning to Compose and Reason with Language Tree Structures for Visual Grounding](https://arxiv.org/abs/1906.01784), TPAMI 2019 + +[Neural Baby Talk](https://arxiv.org/abs/1803.09845), CVPR 2018 [[code]](https://github.com/jiasenlu/NeuralBabyTalk) + +[Grounding Referring Expressions in Images by Variational Context](https://arxiv.org/abs/1712.01892), CVPR 2018 + +[Video Captioning via Hierarchical Reinforcement Learning](https://arxiv.org/abs/1711.11135), CVPR 2018 + +[Charades-Ego: A Large-Scale Dataset of Paired Third and First Person Videos](https://arxiv.org/abs/1804.09626), CVPR 2018 [[code]](https://allenai.org/plato/charades/) + +[Neural Motifs: Scene Graph Parsing with Global Context](https://arxiv.org/abs/1711.06640), CVPR 2018 [[code]](http://github.com/rowanz/neural-motifs) + +[No Metrics Are Perfect: Adversarial Reward Learning for Visual Storytelling](https://arxiv.org/abs/1804.09160), ACL 2018 + +[Generating Descriptions with Grounded and Co-Referenced People](https://arxiv.org/abs/1704.01518), CVPR 2017 + +[DenseCap: Fully Convolutional Localization Networks for Dense Captioning](https://cs.stanford.edu/people/karpathy/densecap/), CVPR 2016 + +[Review Networks for Caption Generation](https://arxiv.org/abs/1605.07912), NeurIPS 2016 [[code]](https://github.com/kimiyoung/review_net) + +[Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding](https://arxiv.org/abs/1604.01753), ECCV 2016 [[code]](https://allenai.org/plato/charades/) + +[Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge](https://arxiv.org/abs/1609.06647), TPAMI 2016 [[code]](https://github.com/tensorflow/models/tree/master/research/im2txt) + +[Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044), ICML 2015 [[code]](https://github.com/kelvinxu/arctic-captions) + +[Deep 
Visual-Semantic Alignments for Generating Image Descriptions](https://arxiv.org/abs/1412.2306v2), CVPR 2015 [[code]](https://github.com/karpathy/neuraltalk2) + +[Show and Tell: A Neural Image Caption Generator](https://arxiv.org/abs/1411.4555), CVPR 2015 [[code]](https://github.com/karpathy/neuraltalk2) + +[A Dataset for Movie Description](https://arxiv.org/abs/1501.02530), CVPR 2015 [[code]](https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/vision-and-language/mpii-movie-description-dataset/) + +[What’s Cookin’? Interpreting Cooking Videos using Text, Speech and Vision](https://arxiv.org/abs/1503.01558), NAACL 2015 [[code]](https://github.com/malmaud/whats_cookin) + +[Microsoft COCO: Common Objects in Context](https://arxiv.org/abs/1405.0312), ECCV 2014 [[code]](http://cocodataset.org/#home) + +### Video Generation from Text + +[Image Generation from Scene Graphs](https://arxiv.org/abs/1804.01622), CVPR 2018 + +[Learning to Color from Language](https://arxiv.org/abs/1804.06026), NAACL 2018 + +[Generative Adversarial Text to Image Synthesis](https://arxiv.org/abs/1605.05396), ICML 2016 + +### Affect Recognition and Multimodal Language + +[End-to-end Facial and Physiological Model for Affective Computing and Applications](https://arxiv.org/abs/1912.04711), arXiv 2019 + +[Affective Computing for Large-Scale Heterogeneous Multimedia Data: A Survey](https://arxiv.org/abs/1911.05609), ACM TOMM 2019 + +[Towards Multimodal Sarcasm Detection (An Obviously_Perfect Paper)](https://arxiv.org/abs/1906.01815), ACL 2019 [[code]](https://github.com/soujanyaporia/MUStARD) + +[Multi-modal Approach for Affective Computing](https://arxiv.org/abs/1804.09452), EMBC 2018 + +[Multimodal Language Analysis with Recurrent Multistage Fusion](https://arxiv.org/abs/1808.03920), EMNLP 2018 + +[Multimodal Language Analysis in the Wild: CMU-MOSEI Dataset and Interpretable Dynamic Fusion Graph](http://aclweb.org/anthology/P18-1208), ACL 2018 
[[code]](https://github.com/A2Zadeh/CMU-MultimodalSDK) + +[Multi-attention Recurrent Network for Human Communication Comprehension](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17390/16123), AAAI 2018 [[code]](https://github.com/A2Zadeh/CMU-MultimodalSDK) + +[End-to-End Multimodal Emotion Recognition using Deep Neural Networks](https://arxiv.org/abs/1704.08619), arXiv 2017 + +[AMHUSE - A Multimodal dataset for HUmor SEnsing](https://dl.acm.org/citation.cfm?id=3136806), ICMI 2017 [[code]](http://amhuse.phuselab.di.unimi.it/) + +[Decoding Children’s Social Behavior](http://www.cbi.gatech.edu/mmdb/docs/mmdb_paper.pdf), CVPR 2013 [[code]](http://www.cbi.gatech.edu/mmdb/) + +[Collecting Large, Richly Annotated Facial-Expression Databases from Movies](http://users.cecs.anu.edu.au/%7Eadhall/Dhall_Goecke_Lucey_Gedeon_M_2012.pdf), IEEE Multimedia 2012 [[code]](https://cs.anu.edu.au/few/AFEW.html) + +[The Interactive Emotional Dyadic Motion Capture (IEMOCAP) Database](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf), 2008 [[code]](https://sail.usc.edu/iemocap/) + +### Healthcare + +[Multimodal Co-Attention Transformer for Survival Prediction in Gigapixel Whole Slide Images](https://openaccess.thecvf.com/content/ICCV2021/html/Chen_Multimodal_Co-Attention_Transformer_for_Survival_Prediction_in_Gigapixel_Whole_Slide_ICCV_2021_paper.html), ICCV 2021 + +[PET-Guided Attention Network for Segmentation of Lung Tumors from PET/CT Images](https://rdcu.be/c8WWl), GCPR 2020 [[code]](https://github.com/pvk95/PAG) + +[Pathomic Fusion: An Integrated Framework for Fusing Histopathology and Genomic Features for Cancer Diagnosis and Prognosis](https://arxiv.org/abs/1912.08937), IEEE TMI 2020 + +[Leveraging Medical Visual Question Answering with Supporting Facts](https://arxiv.org/abs/1905.12008), arXiv 2019 + +[Unsupervised Multimodal Representation Learning across Medical Images and Reports](https://arxiv.org/abs/1811.08615), ML4H 2018 + +[Multimodal Medical Image
Retrieval based on Latent Topic Modeling](https://aiforsocialgood.github.io/2018/pdfs/track1/75_aisg_neurips2018.pdf), ML4H 2018 + +[Improving Hospital Mortality Prediction with Medical Named Entities and Multimodal Learning](https://arxiv.org/abs/1811.12276), ML4H 2018 + +[Knowledge-driven Generative Subspaces for Modeling Multi-view Dependencies in Medical Data](https://arxiv.org/abs/1812.00509), ML4H 2018 + +[Multimodal Depression Detection: Fusion Analysis of Paralinguistic, Head Pose and Eye Gaze Behaviors](https://ieeexplore.ieee.org/document/7763752), TAC 2018 + +[Learning the Joint Representation of Heterogeneous Temporal Events for Clinical Endpoint Prediction](https://arxiv.org/abs/1803.04837), AAAI 2018 + +[Understanding Coagulopathy using Multi-view Data in the Presence of Sub-Cohorts: A Hierarchical Subspace Approach](http://mucmd.org/CameraReadySubmissions/67%5CCameraReadySubmission%5Cunderstanding-coagulopathy-multi%20(6).pdf), MLHC 2017 + +[Machine Learning in Multimodal Medical Imaging](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5357511/), 2017 + +[Cross-modal Recurrent Models for Weight Objective Prediction from Multimodal Time-series Data](https://arxiv.org/abs/1709.08073), ML4H 2017 + +[SimSensei Kiosk: A Virtual Human Interviewer for Healthcare Decision Support](https://dl.acm.org/citation.cfm?id=2617388.2617415), AAMAS 2014 + +[Dyadic Behavior Analysis in Depression Severity Assessment Interviews](https://dl.acm.org/citation.cfm?doid=2663204.2663238), ICMI 2014 + +[Audiovisual Behavior Descriptors for Depression Assessment](https://dl.acm.org/citation.cfm?doid=2522848.2522886), ICMI 2013 + +### Robotics + +[Detect, Reject, Correct: Crossmodal Compensation of Corrupted Sensors](https://arxiv.org/abs/2012.00201), ICRA 2021 + +[Multimodal sensor fusion with differentiable filters](https://arxiv.org/abs/2010.13021), IROS 2020 + +[Concept2Robot: Learning Manipulation Concepts from Instructions and Human 
Demonstrations](http://www.roboticsproceedings.org/rss16/p082.pdf), RSS 2020 + +[See, Feel, Act: Hierarchical Learning for Complex Manipulation Skills with Multi-sensory Fusion](https://robotics.sciencemag.org/content/4/26/eaav3123), Science Robotics 2019 + +[Early Fusion for Goal Directed Robotic Vision](https://arxiv.org/abs/1811.08824), IROS 2019 + +[Simultaneously Learning Vision and Feature-based Control Policies for Real-world Ball-in-a-Cup](https://arxiv.org/abs/1902.04706), RSS 2019 + +[Probabilistic Multimodal Modeling for Human-Robot Interaction Tasks](http://www.roboticsproceedings.org/rss15/p47.pdf), RSS 2019 + +[Making Sense of Vision and Touch: Self-Supervised Learning of Multimodal Representations for Contact-Rich Tasks](https://arxiv.org/abs/1810.10191), ICRA 2019 + +[Evolving Multimodal Robot Behavior via Many Stepping Stones with the Combinatorial Multi-Objective Evolutionary Algorithm +](https://arxiv.org/abs/1807.03392), arXiv 2018 + +[Multi-modal Predicate Identification using Dynamically Learned Robot Controllers](https://www.cs.utexas.edu/~pstone/Papers/bib2html-links/IJCAI18-saeid.pdf), IJCAI 2018 + +[Multimodal Probabilistic Model-Based Planning for Human-Robot Interaction](https://arxiv.org/abs/1710.09483), arXiv 2017 + +[Perching and Vertical Climbing: Design of a Multimodal Robot](https://ieeexplore.ieee.org/document/6907472), ICRA 2014 + +[Multi-Modal Scene Understanding for Robotic Grasping](http://kth.diva-portal.org/smash/get/diva2:459199/FULLTEXT01), 2011 + +[Strategies for Multi-Modal Scene Exploration](https://am.is.tuebingen.mpg.de/uploads_file/attachment/attachment/307/2010_IROS_bjbk_camred.pdf), IROS 2010 + +### Autonomous Driving + +[Deep Multi-modal Object Detection and Semantic Segmentation for Autonomous Driving: Datasets, Methods, and Challenges](https://arxiv.org/pdf/1902.07830.pdf), IEEE TITS 2020 [[website]](https://boschresearch.github.io/multimodalperception/) + +[nuScenes: A multimodal dataset for autonomous 
driving](https://openaccess.thecvf.com/content_CVPR_2020/papers/Caesar_nuScenes_A_Multimodal_Dataset_for_Autonomous_Driving_CVPR_2020_paper.pdf), CVPR 2020 [[dataset]](https://www.nuscenes.org/) + +[Multimodal End-to-End Autonomous Driving](https://arxiv.org/abs/1906.03199), arXiv 2020 + +### Finance + +[A Multimodal Event-driven LSTM Model for Stock Prediction Using Online News](https://ailab-ua.github.io/courses/resources/Qing_TKDE_2020.pdf), TKDE 2020 + +[Multimodal Deep Learning for Finance: Integrating and Forecasting International Stock Markets](https://arxiv.org/abs/1903.06478), 2019 + +[Multimodal deep learning for short-term stock volatility prediction](https://arxiv.org/abs/1812.10479), 2018 + +### Human AI Interaction + +[Multimodal Human Computer Interaction: A Survey](https://link.springer.com/chapter/10.1007/11573425_1), HCI 2005 + +[Affective multimodal human-computer interaction](https://dl.acm.org/doi/10.1145/1101149.1101299), Multimedia 2005 + +[Building a multimodal human-robot interface](https://ieeexplore.ieee.org/abstract/document/1183338?casa_token=tdKeY0Q0e-4AAAAA:XfKwp5Di1O5bCEOnebeaS58waSbWm80lxNuY8IhWW7DqDLvRQj-8ettJW1NrFrmoR_ShudTgzw), IEEE Intelligent Systems 2001 + +### Multimodal Content Generation + +[Non-Linear Consumption of Videos Using a Sequence of Personalized Multimodal Fragments](https://gaurav22verma.github.io/assets/papers/NonLinearConsumption.pdf), IUI 2021 + +[Generating Need-Adapted Multimodal Fragments](https://gaurav22verma.github.io/assets/MultimodalFragments.pdf), IUI 2020 + +# Workshops + +[Multimodal KDD 2023: International Workshop on Multimodal Learning](https://multimodal-kdd-2023.github.io), KDD 2023 + +[Multimodal Representation Learning: Perks and Pitfalls](https://mrl-workshop.github.io/iclr-2023/), ICLR 2023 + +[Social Intelligence in Humans and Robots](https://social-intelligence-human-ai.github.io/) @ ICRA 2021 + +[LANTERN 2021](https://www.lantern.uni-saarland.de/2021/): The Third Workshop Beyond Vision 
and LANguage: inTEgrating Real-world kNowledge @ EACL 2021 + +Multimodal workshops @ CVPR 2021: [Multimodal Learning and Applications](https://mula-workshop.github.io/), [Sight and Sound](http://sightsound.org/), [Visual Question Answering](https://visualqa.org/workshop), [Embodied AI](https://embodied-ai.org/), [Language for 3D Scenes](http://language3dscenes.github.io/). + +Multimodal workshops @ NAACL 2021: [MAI-Workshop](http://multicomp.cs.cmu.edu/naacl2021multimodalworkshop/), [ALVR](https://alvr-workshop.github.io/), [ViGIL](https://vigilworkshop.github.io/). + +ICLR 2021 workshop on [Embodied Multimodal Learning](https://eml-workshop.github.io/). + +NeurIPS 2020 workshop on [Wordplay: When Language Meets Games](https://wordplay-workshop.github.io/). + +ACL 2020 workshops on [Multimodal Language](http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/) [(proceedings)](https://www.aclweb.org/anthology/volumes/2020.challengehml-1/) and [Advances in Language and Vision Research](https://alvr-workshop.github.io/). + +Multimodal workshops @ ECCV 2020: [EVAL](https://askforalfred.com/EVAL/), [CAMP](https://camp-workshop.stanford.edu/), and [MVA](https://sites.google.com/view/multimodalvideo-v2). 
+ +[Multi-Modal Video Reasoning and Analyzing Competition](https://sutdcv.github.io/multi-modal-video-reasoning), ICCV 2021 + +[Grand Challenge and Workshop on Human Multimodal Language](http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/), ACL 2020, ACL 2018 + +[Advances in Language and Vision Research](https://alvr-workshop.github.io/), ACL 2020 + +[Visually Grounded Interaction and Language](https://vigilworkshop.github.io/), NeurIPS 2019, NeurIPS 2018 + +[Emergent Communication: Towards Natural Language](https://sites.google.com/view/emecom2019), NeurIPS 2019 + +[Workshop on Multimodal Understanding and Learning for Embodied Applications](https://sites.google.com/view/mulea2019/home), ACM Multimedia 2019 + +[Beyond Vision and Language: Integrating Real-World Knowledge](https://www.lantern.uni-saarland.de/), EMNLP 2019 + +[The How2 Challenge: New Tasks for Vision & Language](https://srvk.github.io/how2-challenge/), ICML 2019 + +[Visual Question Answering and Dialog](https://visualqa.org/workshop.html), CVPR 2019, CVPR 2017 + +[Multi-modal Learning from Videos](https://sites.google.com/view/mmlv/home), CVPR 2019 + +[Multimodal Learning and Applications Workshop](https://mula-workshop.github.io/), CVPR 2019, ECCV 2018 + +[Habitat: Embodied Agents Challenge and Workshop](https://aihabitat.org/workshop/), CVPR 2019 + +[Closing the Loop Between Vision and Language & LSMD Challenge](https://sites.google.com/site/iccv19clvllsmdc/), ICCV 2019 + +[Multi-modal Video Analysis and Moments in Time Challenge](https://sites.google.com/view/multimodalvideo/), ICCV 2019 + +[Cross-Modal Learning in Real World](https://cromol.github.io/), ICCV 2019 + +[Spatial Language Understanding and Grounded Communication for Robotics](https://splu-robonlp.github.io/), NAACL 2019 + +[YouTube-8M Large-Scale Video Understanding](https://research.google.com/youtube8m/workshop2018/), ICCV 2019, ECCV 2018, CVPR 2017 + +[Language and Vision Workshop](http://languageandvision.com/), CVPR 2019, 
CVPR 2018, CVPR 2017, CVPR 2015 + +[Sight and Sound](http://sightsound.org/), CVPR 2019, CVPR 2018 + +[The Large Scale Movie Description Challenge (LSMDC)](https://sites.google.com/site/describingmovies/), ICCV 2019, ICCV 2017 + +[Wordplay: Reinforcement and Language Learning in Text-based Games](https://www.wordplay2018.com/), NeurIPS 2018 + +[Interpretability and Robustness in Audio, Speech, and Language](https://irasl.gitlab.io/), NeurIPS 2018 + +[Multimodal Robot Perception](https://natanaso.github.io/rcw-icra18/), ICRA 2018 + +[WMT18: Shared Task on Multimodal Machine Translation](http://www.statmt.org/wmt18/multimodal-task.html), EMNLP 2018 + +[Shortcomings in Vision and Language](https://sites.google.com/view/sivl/), ECCV 2018 + +[Computational Approaches to Subjectivity, Sentiment and Social Media Analysis](https://wt-public.emm4u.eu/wassa2018/), EMNLP 2018, EMNLP 2017, NAACL-HLT 2016, EMNLP 2015, ACL 2014, NAACL-HLT 2013 + +[Visual Understanding Across Modalities](http://vuchallenge.org/), CVPR 2017 + +[International Workshop on Computer Vision for Audio-Visual Media](https://cvavm2017.wordpress.com/), ICCV 2017 + +[Language Grounding for Robotics](https://robo-nlp.github.io/2017_index.html), ACL 2017 + +[Computer Vision for Audio-visual Media](https://cvavm2016.wordpress.com/), ECCV 2016 + +[Language and Vision](https://vision.cs.hacettepe.edu.tr/vl2016/), ACL 2016, EMNLP 2015 + +# Tutorials + +[Tutorial on MultiModal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-tutorial/icml2023/), ICML 2023, CVPR 2022, NAACL 2022 + +[Recent Advances in Vision-and-Language Research](https://rohit497.github.io/Recent-Advances-in-Vision-and-Language-Research/), CVPR 2020 + +[Connecting Language and Vision to Actions](https://lvatutorial.github.io/), ACL 2018 + +[Machine Learning for Clinicians: Advances for Multi-Modal Health Data](https://www.michaelchughes.com/mlhc2018_tutorial.html), MLHC 2018 + +[Multimodal Machine 
Learning](https://sites.google.com/site/multiml2016cvpr/), ACL 2017, CVPR 2016, ICMI 2016 + +[Vision and Language: Bridging Vision and Language with Deep Learning](https://www.microsoft.com/en-us/research/publication/vision-language-bridging-vision-language-deep-learning/), ICIP 2017 + +# Courses + +[CMU 11-777 Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/mmml-course/fall2022/) + +[CMU 11-877 Advanced Topics in Multimodal Machine Learning](https://cmu-multicomp-lab.github.io/adv-mmml-course/spring2023/) + +[CMU 05-618, Human-AI Interaction](https://haiicmu.github.io/) + +[CMU 11-777, Advanced Multimodal Machine Learning](https://piazza.com/cmu/fall2018/11777/resources) + +[Stanford CS422: Interactive and Embodied Learning](http://cs422interactive.stanford.edu/) + +[CMU 16-785, Integrated Intelligence in Robotics: Vision, Language, and Planning](http://www.cs.cmu.edu/~jeanoh/16-785/) + +[CMU 10-808, Language Grounding to Vision and Control](https://katefvision.github.io/LanguageGrounding/) + +[CMU 11-775, Large-Scale Multimedia Analysis](https://sites.google.com/a/is.cs.cmu.edu/lti-speech-classes/11-775-large-scale-multimedia-analysis) + +[MIT 6.882, Embodied Intelligence](https://phillipi.github.io/6.882/) + +[Georgia Tech CS 8803, Vision and Language](http://www.prism.gatech.edu/~arjun9/CS8803_CVL_Fall17/) + +[University of Virginia CS 6501-004, Vision & Language](http://www.cs.virginia.edu/~vicente/vislang/) \ No newline at end of file diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 0000000..1324e1e --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,139 @@ + +**[Zeta's 3-Step Master Plan for Perfecting Multi-Modality LLMs]** + +--- + +**1. Refinement and Excellence: Perfecting the Framework** + - **[Objective]**: To develop Zeta into the most sophisticated, yet intuitively simple framework for building Multi-Modality LLMs.
+ + - **[Strategies]** + - **Zeta Innovation Labs**: + * Create a dedicated team of experts who exclusively focus on refining the foundational modules and blocks. + * Prioritize research in areas like advanced self-supervised learning, multi-modal integration, and zero-shot learning. + - **Modularity Focus**: + * Develop plug-and-play modules that allow developers to effortlessly incorporate various data types (text, image, video, audio) into their LLMs. + * Standardize the blocks ensuring consistent performance, error-handling, and interoperability. + - **Performance Optimization**: + * Collaborate with hardware manufacturers to ensure that Zeta is perfectly optimized for cutting-edge GPUs, TPUs, and other specialized hardware. + * Roll out regular updates to keep the framework at the forefront of performance. + +--- + +**2. User-Centric Development: Making Zeta Intuitive** + - **[Objective]**: Ensure that every feature, tool, and module in Zeta aligns with the principle of making LLM creation simpler and more efficient. + + - **[Strategies]** + - **Zeta Academy**: + * Host frequent workshops and webinars targeted at educating users on harnessing the power of Zeta's multi-modality LLM features. + * Create a vast library of tutorials, ranging from beginner to advanced, with real-world examples of LLM implementation. + - **Interactive GUI for LLM Design**: + * Develop a visual interface where users can drag-and-drop modules, visualize their LLM architecture, and see real-time performance metrics. + - **Feedback Loops**: + * Create a robust system to collect and implement feedback. Users should feel like they’re co-creating Zeta. + * Launch a beta program where selected developers can test new features and provide insights. + +--- + +**3. Scaling and Outreach: From the Labs to the World** + - **[Objective]**: Make Zeta the de facto choice for developers worldwide aiming to craft state-of-the-art Multi-Modality LLMs.
+ + - **[Strategies]** + - **Zeta Ambassadors**: + * Identify and collaborate with top AI researchers and practitioners globally, making them the face and voice of Zeta in their communities. + - **Strategic Partnerships**: + * Work closely with major tech institutions, universities, and platforms to integrate Zeta into their curriculum or platforms. + * Create an API gateway for seamless integration of Zeta with other popular machine learning and data processing platforms. + - **Global Challenges & Competitions**: + * Organize worldwide LLM challenges, where developers use Zeta to solve real-world problems, bringing attention to both the problems and the capabilities of Zeta. + +--- + + +In every tool, in every line of code, in every module of Zeta, you'll find our relentless pursuit of excellence. But remember, at its core, + +Zeta isn't about us, + +it's about you, the creator. + +It's about giving you the power, the simplicity, and the edge to redefine the boundaries of what's possible. + +With Zeta, weโ€™re not just building a tool; we're crafting the future. + +A future we're eager to see through your eyes. + + + + +------ + + + + + + + + + + + + + + + + + + + + + + + +**[Zeta's 3-Step Master Plan]** + +**1. Cultivate an Ecosystem of Innovation** + - **[Objective]**: Establish an environment where creativity and innovation are paramount. + + - **[Strategies]** + - **Education & Outreach**: + * Launch a series of free online courses, workshops, and webinars to educate developers on the capabilities and advantages of Zeta. + * Partner with top universities and institutions, offering them early access and integrations, fostering a new generation of developers natively trained on Zeta. + - **Zeta Labs**: + * Open a research lab committed to pushing the boundaries of what neural networks can achieve. + * Provide grants, resources, and mentorship to promising projects and startups that choose to build with Zeta. 
+ - **Open Source Philosophy**: + * Release parts of Zeta's core codebase to the public, inviting developers worldwide to contribute, refine, and expand upon the framework. + * Organize hackathons and coding challenges to galvanize the community around real-world problems that Zeta can solve. + +--- + +**2. Seamless Integration & Scalability** + - **[Objective]**: Make Zeta the easiest, most efficient, and most scalable framework to integrate into any project or system. + + - **[Strategies]** + - **Developer Toolkits**: + * Release a suite of tools, plugins, and libraries for all major development platforms and languages, ensuring Zeta is accessible to everyone, everywhere. + - **Zeta Cloud**: + * Offer a cloud solution that allows developers to run, test, and deploy their neural networks seamlessly. This ensures businesses of all sizes can scale without friction. + - **Partnerships**: + * Collaborate with major tech companies, ensuring Zeta's native support on platforms like AWS, Google Cloud, and Azure. + * Establish alliances with hardware manufacturers, optimizing Zeta for the latest GPUs and Neural Network Processors. + +--- + +**3. Build a Community and Cultivate Trust** + - **[Objective]**: Establish Zeta as more than a tool – it should be a movement, a community of forward-thinkers who believe in redefining the boundaries of neural network capabilities. + + - **[Strategies]** + - **ZetaCon**: + * Annually host a global conference (both offline and online) bringing together the brightest minds in the AI and machine learning sector. It will be a platform for networking, knowledge-sharing, and showcasing the best of what's been built using Zeta. + - **Transparency Reports**: + * Release regular updates about Zeta's development, challenges, successes, and roadmap. + * Actively gather feedback, ensuring the community feels heard and that their insights are valued.
+ - **Zeta Academy**: + * Create a platform where developers can share their projects, tutorials, and courses about Zeta. Recognize and reward the best contributions to foster a sense of ownership and pride within the community. + +--- + +This isn't just a roadmap. It's our promise, our commitment. Because at the end of the day, it's not about the lines of code we write. It's about the lives we change, the innovations we inspire, and the future we create. And with Zeta, we believe that future is brighter than ever. Let's build it together. + + diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 0000000..be1c98e --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,4 @@ +:root { + --md-primary-fg-color: #8315F9; + --md-accent-fg-color: #00FFCE; + } \ No newline at end of file diff --git a/docs/zeta/.DS_Store b/docs/zeta/.DS_Store new file mode 100644 index 0000000..95f7139 Binary files /dev/null and b/docs/zeta/.DS_Store differ diff --git a/docs/zeta/index.md b/docs/zeta/index.md new file mode 100644 index 0000000..6ae54af --- /dev/null +++ b/docs/zeta/index.md @@ -0,0 +1,59 @@ +The Zeta framework provides developers with the ability to create State of The Art Models as simply and seamlessly as possible through **Modularity**, **Reliability**, **Use-Ability**, and **Speed**. + +Zeta not only helps developers harness the potential of LLMs and Multi-Modal Foundation Models but also enforces trust boundaries, schema validation, and tool activity-level permissions. By doing so, Zeta maximizes LLMs’ reasoning while adhering to strict policies regarding their capabilities. + +Zeta’s design philosophy is based on the following tenets: + +1. **Use-Ability**: Utilizing Zeta should feel like going for a swim in the ocean, seamless and fluid with pythonic methods and classes and error handling that signifies what steps to take next. +2.
**Reliability**: Zeta puts every FLOP to work by harnessing ultra-reliable and high-performance designs for all functions and classes. +3. **Speed**: Zeta is like the Lamborghini of ML frameworks with simply unparalleled speed. + +## Quick Starts + +### Using pip + +Install **zeta** + +``` +pip3 install zeta +``` + +## Unleash FlashAttention +With Zeta, you can unleash the best and highest performance attention mechanisms like `FlashAttention` and `MultiQueryAttention`. Here's an example with `FlashAttention`: + +```python +import torch +from zeta import FlashAttention + +q = torch.randn(2, 4, 6, 8) +k = torch.randn(2, 4, 10, 8) +v = torch.randn(2, 4, 10, 8) + +attention = FlashAttention(causal=False, dropout=0.1, flash=False) +output = attention(q, k, v) + +print(output.shape) +``` + +## Unleash GPT-4 +On top of the SOTA Attention mechanisms we provide, we also provide rough implementations of some of the best neural nets ever made, like `GPT4`. Here's an example of how to use our implementation of GPT-4: + +```python +import torch +from zeta import GPT4, GPT4MultiModal + +#text +text = torch.randint(0, 256, (1, 1024)).cuda() +img = torch.randn(1, 3, 256, 256) + +gpt4_language = GPT4() + +gpt4_language(text) + +#multimodal GPT4 + +gpt4_multimodal = GPT4MultiModal() +gpt4_multimodal_output = gpt4_multimodal(text, img) + +``` + diff --git a/docs/zeta/nn/architecture/decoder.md b/docs/zeta/nn/architecture/decoder.md new file mode 100644 index 0000000..3fcf811 --- /dev/null +++ b/docs/zeta/nn/architecture/decoder.md @@ -0,0 +1,103 @@ +# Decoder Class Documentation + +Module/Class Name: Decoder + +```python +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal=True, **kwargs) +``` + +## Overview and Introduction + +The `Decoder` class is a component of the Zeta library designed for creating a decoder model with multiple attention layers.
It extends the functionality of the `AttentionLayers` class to enable the construction of a decoder architecture. The decoder is a key component in various sequence-to-sequence tasks, such as machine translation, text generation, and more. + +The decoder employs multi-head self-attention mechanisms and feed-forward networks to transform input sequences into meaningful output sequences while maintaining the causal property. It is particularly suitable for autoregressive tasks, where each step depends only on previous steps in the sequence. + +## Class Definition + +```python +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal=True, **kwargs) +``` + +The `Decoder` class inherits from the `AttentionLayers` class and introduces the causality constraint by setting `causal=True`. It is initialized with various parameters that configure the architecture and behavior of the decoder. + +## Parameters + +The `Decoder` class constructor accepts various parameters that control the behavior of the decoder. The most important parameters are inherited from the `AttentionLayers` class, and additional parameters specific to the decoder are introduced. Below is a summary of the parameters: + +- `dim` (int): Dimensionality of the model. +- `depth` (int): Number of decoder layers. +- `heads` (int): Number of parallel attention heads. +- `cross_attend` (bool): Enable cross-attention between input and output sequences. +- `sandwich_coef` (int): Coefficient for configuring sandwich normalization. +- `residual_attn` (bool): Enable residual connection for self-attention layers. +- `cross_residual_attn` (bool): Enable residual connection for cross-attention layers. +- `layer_dropout` (float): Dropout probability applied to each layer. +- ... 
(additional parameters inherited from `AttentionLayers`) + +## Functionality and Usage + +The `Decoder` class extends the functionality of the `AttentionLayers` class to specifically create decoder models. It employs multi-head self-attention mechanisms and feed-forward networks to process input sequences and generate output sequences. + +### Initialization + +To create a decoder instance, you can use the following code: + +```python +from zeta import Decoder + +decoder = Decoder( + dim=512, + depth=6, + heads=8, + # do not pass `causal`: Decoder always sets causal=True and asserts it is absent from kwargs + cross_attend=True, + residual_attn=True, + layer_dropout=0.1 +) +``` + +### Forward Pass + +The forward pass of the decoder can be performed using the following code: + +```python +output = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence) +``` + +Here, `input_sequence` represents the input sequence to the decoder, `context_sequence` represents the context sequence for cross-attention (if enabled), `mask_sequence` is an optional mask to ignore certain elements in the input, and `context_mask_sequence` is an optional mask for the context sequence. + +### Return Intermediates + +If desired, you can also obtain intermediate outputs at each layer using the `return_hiddens` parameter: + +```python +output, intermediates = decoder(input_sequence, context=context_sequence, mask=mask_sequence, context_mask=context_mask_sequence, return_hiddens=True) +``` + +The `intermediates` object will contain information about intermediate hidden states and attention outputs for each layer. + +## Mathematical Formula + +The `Decoder` class is built upon the foundation of multi-head self-attention and feed-forward networks. It can be summarized using the following mathematical formula: + +1. Input Embedding: \( X \) +2. Multi-Head Self-Attention: \( A = \text{MultiHeadAttention}(X) \) +3. Feed-Forward Network: \( Y = \text{FeedForward}(A) \) +4.
Residual Connection: \( Z = X + Y \) + +The above formula represents the basic forward pass of each layer in the decoder. The decoder iteratively applies these operations across its layers to generate meaningful output sequences while maintaining causal dependencies. + +## References + +- [Zeta Library Documentation](https://example.com/zeta/docs) +- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) +- [PAR: Prompted Attention](https://arxiv.org/abs/2207.04503) + + +This documentation provides an in-depth overview of the `Decoder` class in the Zeta library. It covers its purpose, parameters, and usage examples, and includes a simplified mathematical formula to illustrate its functionality. \ No newline at end of file diff --git a/docs/zeta/nn/architecture/transformer.md b/docs/zeta/nn/architecture/transformer.md new file mode 100644 index 0000000..c2bb20a --- /dev/null +++ b/docs/zeta/nn/architecture/transformer.md @@ -0,0 +1,140 @@ +# Transformer Documentation + +## Overview + +The `Transformer` class in the Zeta library is a versatile deep learning architecture that combines attention mechanisms with feedforward neural networks for various natural language processing tasks, such as language modeling, machine translation, and text generation. The Transformer architecture was introduced in the paper "Attention is All You Need" by Vaswani et al. + +The main purpose of the `Transformer` class is to provide a flexible and configurable interface for creating transformer-based models for sequence-to-sequence tasks. The class allows users to specify the number of tokens, maximum sequence length, attention layers, embeddings, and other parameters necessary for creating and training transformer models. + +The Transformer class supports both autoregressive and non-autoregressive training settings and includes features such as relative positional biases, rotary positional embeddings, memory tokens, and more.
+ +## Class Signature + +```python +class Transformer(nn.Module): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + embedding_provider: BaseEmbedding, + emb_dim = None, + max_mem_len = 0., + shift_mem_down = 0, + emb_dropout = 0., + post_emb_norm = False, + num_memory_tokens = None, + tie_embedding = False, + logits_dim = None, + use_abs_pos_emb = True, + scaled_sinu_pos_emb = False, + l2norm_embed = False, + emb_frac_gradient = 1. + ) +``` + +## Parameters + +- `num_tokens` (int): The total number of tokens in the vocabulary. +- `max_seq_len` (int): The maximum length of the input sequences. +- `attn_layers` (AttentionLayers): An instance of the `AttentionLayers` class representing the core attention layers of the transformer. +- `embedding_provider` (BaseEmbedding): An instance of the `BaseEmbedding` class providing token embeddings. +- `emb_dim` (int, optional): The embedding dimension. Default is `None`, in which case `emb_dim` is set to the same dimension as the `attn_layers`. +- `max_mem_len` (float, optional): Maximum memory length for memory tokens. Default is `0.0`, indicating no memory tokens. +- `shift_mem_down` (int, optional): Number of positions to shift memory tokens down in each layer. Default is `0`. +- `emb_dropout` (float, optional): Dropout rate applied to the embedding layer. Default is `0.0`. +- `post_emb_norm` (bool, optional): Apply layer normalization to the post-embedding inputs. Default is `False`. +- `num_memory_tokens` (int, optional): Number of memory tokens to use. Default is `None`, indicating no memory tokens. +- `tie_embedding` (bool, optional): Tie the output projection weights with the input token embeddings. Default is `False`. +- `logits_dim` (int, optional): Dimensionality of the output logits. Default is `None`, indicating that it's the same as `num_tokens`. +- `use_abs_pos_emb` (bool, optional): Use absolute positional embeddings. Default is `True`. 
+- `scaled_sinu_pos_emb` (bool, optional): Use scaled sinusoidal positional embeddings. Default is `False`. +- `l2norm_embed` (bool, optional): Apply L2 normalization to the embeddings. Default is `False`. +- `emb_frac_gradient` (float, optional): Fraction of the gradient that should go to the embedding. Default is `1.0`. + +## Methods + +### `forward` + +```python +def forward( + self, + x, + return_embeddings = False, + return_logits_and_embeddings = False, + return_intermediates = False, + mask = None, + return_mems = False, + return_attn = False, + mems = None, + pos = None, + prepend_embeds = None, + sum_embeds = None, + **kwargs +) +``` + +This method computes the forward pass of the transformer. + +#### Parameters + +- `x` (torch.Tensor): Input tensor representing the sequence of token indices. +- `return_embeddings` (bool, optional): If `True`, return only the embeddings without applying the output projection. Default is `False`. +- `return_logits_and_embeddings` (bool, optional): If `True`, return both the logits and embeddings. Default is `False`. +- `return_intermediates` (bool, optional): If `True`, return intermediate attention values. Default is `False`. +- `mask` (torch.Tensor, optional): Attention mask indicating positions to be masked. Default is `None`. +- `return_mems` (bool, optional): If `True`, return updated memory tokens. Default is `False`. +- `return_attn` (bool, optional): If `True`, return attention maps. Default is `False`. +- `mems` (list of torch.Tensor, optional): Memory tokens for each layer. Default is `None`. +- `pos` (torch.Tensor, optional): External positional embeddings. Default is `None`. +- `prepend_embeds` (torch.Tensor, optional): Prepend embeddings to the input sequence. Default is `None`. +- `sum_embeds` (torch.Tensor, optional): Sum external embeddings to the input sequence. Default is `None`. +- `kwargs`: Additional keyword arguments passed to the attention layers. 
+ +#### Returns + +The method returns the output logits or embeddings based on the specified return options. + +## Usage Examples + +Here are three usage examples of the `Transformer` class from the Zeta library: + +```python +from zeta.nn import Transformer + +# Example 1: Basic Usage +transformer = Transformer( + num_tokens=10000, + max_seq_len=256, + attn_layers=attn_layers_instance, + embedding_provider=embedding_provider_instance +) +logits = transformer(input_tokens) + +# Example 2: Return Embeddings +embeddings = transformer(input_tokens, return_embeddings=True) + +# Example 3: Return Intermediate Attention Maps +logits, attn_maps = transformer(input_tokens, return_attn=True) +``` + +In these examples, replace `attn_layers_instance` and `embedding_provider_instance` with actual instances of `AttentionLayers` and `BaseEmbedding`, respectively, and `input_tokens` with your input tensor containing token indices. + +## Mathematical Formula + +The mathematical formula for the `Transformer` class can be represented as follows: + +``` +Input -> Embedding -> Post-embedding Norm -> Embedding Dropout -> Project Embedding -> Attention Layers -> Layer Normalization -> To Logits/Embeddings +``` + +In this formula, "Attention Layers" represents the core attention mechanism of the transformer, which includes self-attention and feedforward neural networks. + +## References + +- Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is All You Need. Advances in neural information processing systems, 30. +- Zeta Library: Link to the official documentation of the Zeta library. +- Insert any additional references or resources as needed. 
+``` + diff --git a/docs/zeta/nn/attention/base.md b/docs/zeta/nn/attention/base.md new file mode 100644 index 0000000..41dfe49 --- /dev/null +++ b/docs/zeta/nn/attention/base.md @@ -0,0 +1,90 @@ +# BaseAttention Abstract Class +============================ + +The `BaseAttention` class is an abstract base class that defines the interface for all attention mechanisms. It includes the basic structure and methods that all attention mechanisms should have. + +```python +from abc import abstractmethod +import torch.nn as nn + +class BaseAttention(nn.Module): + @abstractmethod + def __init__(self, dim): + super().__init__() + self.dim = dim + + + @abstractmethod + def forward(self, x, context=None, mask=None): + pass +``` + + +## Usage +----------------------- + +The `FlashAttentionTwo` class extends the `BaseAttention` abstract base class and implements the specific attention mechanism. + +```python +class FlashAttentionTwo(BaseAttention): + def __init__( + self, + *, + dim, + heads = 8, + dim_head = 64, + causal = False, + q_bucket_size = 512, + k_bucket_size = 1024, + parallel = False, + mixed_precision = False + ): + super().__init__(dim, heads, dim_head) + self.causal = causal + self.parallel = parallel + self.mixed_precision = mixed_precision + self.q_bucket_size = q_bucket_size + self.k_bucket_size = k_bucket_size + # ... rest of the implementation ... + + def forward( + self, + x, + context = None, + mask = None, + q_bucket_size = None, + k_bucket_size = None, + ): + # ... implementation of the forward method ... +``` + + +## Rules for Using the BaseAttention Class +--------------------------------------- + +1. Any class that extends the `BaseAttention` class must implement the `forward` method. This method defines how the attention mechanism operates. + +2. The `__init__` method of the `BaseAttention` class, as shown in the class definition above, takes a single parameter: `dim`. 
Any class that extends `BaseAttention` should pass this parameter to the `__init__` method of the base class. + +3. The `forward` method of the `BaseAttention` class takes three parameters: `x`, `context`, and `mask`. Any class that extends `BaseAttention` should include these parameters in its `forward` method. + +--- + +## Example of Using the FlashAttentionTwo Class +-------------------------------------------- + +```python +from zeta import FlashAttentionTwo + +# Create an instance of the FlashAttentionTwo class +attention = FlashAttentionTwo(dim=512, heads=8, dim_head=64) + +# Create some input data +x = torch.randn(1, 10, 512) + +# Apply the attention mechanism +out = attention(x) +``` + + +In this example, we first create an instance of the `FlashAttentionTwo` class. We then create some input data `x` and apply the attention mechanism to this data by calling the `attention` instance, which invokes its `forward` method. \ No newline at end of file diff --git a/docs/zeta/nn/attention/flash2.md b/docs/zeta/nn/attention/flash2.md new file mode 100644 index 0000000..d1fbee2 --- /dev/null +++ b/docs/zeta/nn/attention/flash2.md @@ -0,0 +1,155 @@ +# Module Name: FlashAttentionTwo + +The `FlashAttentionTwo` class is a PyTorch module that implements a variant of the attention mechanism, which is a key component in many state-of-the-art models in natural language processing and other fields. This class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance. + +## Class Definition +---------------- + +```python +class FlashAttentionTwo(nn.Module): + def __init__( + self, + *, + dim, + heads = 8, + dim_head = 64, + causal = False, + q_bucket_size = 512, + k_bucket_size = 1024, + parallel = False, + mixed_precision = False + ): +``` + +--- + +### Parameters + +- `dim` (int): The dimensionality of the input data. +- `heads` (int, optional): The number of attention heads. Default is 8.
+- `dim_head`ย (int, optional): The dimensionality of each attention head. Default is 64. +- `causal`ย (bool, optional): If True, the attention mechanism is causal. Default is False. +- `q_bucket_size`ย (int, optional): The bucket size for the query in the attention mechanism. Default is 512. +- `k_bucket_size`ย (int, optional): The bucket size for the key in the attention mechanism. Default is 1024. +- `parallel`ย (bool, optional): If True, the computation is performed in parallel across multiple GPUs. Default is False. +- `mixed_precision`ย (bool, optional): If True, the computation is performed in mixed precision for improved performance. Default is False. + +----- + +### Methods + +#### `forward` + +``` +def forward( + self, + x, + context = None, + mask = None, + q_bucket_size = None, + k_bucket_size = None, +): +``` + +Performs the forward pass of the attention mechanism. + +##### Parameters + +- `x`ย (Tensor): The input data. +- `context`ย (Tensor, optional): The context for the attention mechanism. If not provided, the input dataย `x`ย is used as the context. +- `mask`ย (Tensor, optional): An optional mask for the attention mechanism. +- `q_bucket_size`ย (int, optional): The bucket size for the query in the attention mechanism. If not provided, the value specified during initialization is used. +- `k_bucket_size`ย (int, optional): The bucket size for the key in the attention mechanism. If not provided, the value specified during initialization is used. + +--- + +##### Returns + +- `out`ย (Tensor): The output of the attention mechanism. 
+ + +## Usage Examples +-------------- + +### Example 1: Basic Usage + +```python +from torch import nn +from zeta import FlashAttentionTwo + +model = FlashAttentionTwo(dim=512) +x = torch.randn(1, 10, 512) +out = model(x) +``` + +Copy code + +### Example 2: Using a Mask + +```python +from torch import nn +from zeta import FlashAttentionTwo + +model = FlashAttentionTwo(dim=512) +x = torch.randn(1, 10, 512) +mask = torch.ones(1, 10) +out = model(x, mask=mask) +``` + +---- + +### Example 3: Using a Context + +```python +from torch import nn +from zeta import FlashAttentionTwo + +model = FlashAttentionTwo(dim=512) +x = torch.randn(1, 10, 512) +context = torch.randn(1, 10, 512) +out = model(x, context=context) +``` + + +## Mathematical Formula +-------------------- + +The attention mechanism can be described by the following formula: + +![Attention Formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/0de1e8f5c8f6e3c3e1f8b3c89a6a2b7b187a5d3f) + +where Q, K, and V are the query, key, and value, respectively. The softmax function ensures that the weights sum to 1, and the dot product of the weights and the value gives the output of the attention mechanism. + + +### Additional Information +---------------------- + +Theย `FlashAttentionTwo`ย class is designed to be memory-efficient and optionally supports parallel computation and mixed precision for improved performance. + +- Theย `parallel`ย parameter allows the computation to be performed in parallel across multiple GPUs. This can significantly speed up the computation for large models or large datasets. + +- Theย `mixed_precision`ย parameter allows the computation to be performed in mixed precision. This means that some operations are performed in lower precision (e.g., float16) and some in higher precision (e.g., float32). This can significantly speed up the computation and reduce memory usage on modern GPUs that support mixed precision. 
+ +- The `q_bucket_size` and `k_bucket_size` parameters control the bucket size for the query and key in the attention mechanism, respectively. These parameters can be used to trade off between memory usage and computational efficiency. Smaller bucket sizes use less memory but may be slower; larger bucket sizes are typically faster but use more memory. + +### Common Issues +------------- + +- If you encounter out-of-memory errors, you can try reducing the `q_bucket_size` and `k_bucket_size` parameters, or enabling mixed precision computation by setting `mixed_precision=True`. + +- If you encounter slow computation, you can try increasing the `q_bucket_size` and `k_bucket_size` parameters, or enabling parallel computation by setting `parallel=True` (if you have multiple GPUs available). + +### References and Resources +------------------------ + +- [Attention Is All You Need](https://arxiv.org/abs/1706.03762): This is the paper that introduced the Transformer architecture, which is built entirely on attention mechanisms. + +- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html): The official PyTorch documentation provides detailed information about the PyTorch library and its modules. + +- [Efficient Attention: Attention with Linear Complexities](https://arxiv.org/abs/1812.01243): This paper introduces the concept of bucketing in the attention mechanism to improve memory efficiency. + +- [Mixed Precision Training](https://arxiv.org/abs/1710.03740): This paper introduces the concept of mixed precision training, which can significantly speed up computation and reduce memory usage on modern GPUs. + +- [PyTorch Tutorials](https://pytorch.org/tutorials/): The official PyTorch tutorials provide many examples of how to use PyTorch for various tasks.
+ +- \ No newline at end of file diff --git a/docs/zeta/nn/attention/flash_attention.md b/docs/zeta/nn/attention/flash_attention.md new file mode 100644 index 0000000..27c06fb --- /dev/null +++ b/docs/zeta/nn/attention/flash_attention.md @@ -0,0 +1,105 @@ +# FlashAttention + +The FlashAttention module performs efficient attention computations, specifically designed for leveraging hardware capabilities on certain NVIDIA GPUs. It offers the option to perform "flash" attention which can be computationally faster on specific GPU architectures. + +--- + +## Class Definition: + +```python +class FlashAttention(nn.Module): +``` + +### Parameters: + +- `causal` (bool, optional): Determines whether to apply causal masking. Default: False. +- `dropout` (float, optional): Dropout probability. Default: 0. +- `flash` (bool, optional): Whether to use flash attention. Requires PyTorch version 2.0 or above. Default: True. + +--- + +## Methods: + +### `__init__(self, causal=False, dropout=0., flash=True)` + +Initializes the FlashAttention module. + +### `get_mask(self, i, j, device)` + +Generates a mask for attention computation. + +#### Parameters: +- `i` (int): Length of the query sequence. +- `j` (int): Length of the key sequence. +- `device` (torch.device): Device to place the mask tensor. + +#### Returns: +- `torch.Tensor`: Mask tensor of shape `(i, j)`. + +### `flash_attn(self, q, k, v, mask=None, attn_bias=None)` + +Performs flash attention computation. + +#### Parameters: +- `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`. +- `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`. +- `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`. +- `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None. +- `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None. 
+ +#### Returns: +- `torch.Tensor`: Output tensor of shape `(batch, heads, q_len, dim)`. + +### `forward(self, q, k, v, mask=None, attn_bias=None)` + +Performs the attention computation using einstein notation. + +#### Parameters: +- `q` (torch.Tensor): Query tensor of shape `(batch, heads, q_len, dim)`. +- `k` (torch.Tensor): Key tensor of shape `(batch, heads, k_len, dim)`. +- `v` (torch.Tensor): Value tensor of shape `(batch, heads, v_len, dim)`. +- `mask` (torch.Tensor, optional): Mask tensor of shape `(batch, heads, q_len, k_len)`. Default: None. +- `attn_bias` (torch.Tensor, optional): Attention bias tensor of shape `(batch, heads, q_len, k_len)`. Default: None. + +#### Returns: +- `torch.Tensor`: Attention output tensor. + +--- + +## Usage Examples: + +1. **Basic Usage**: +```python +from zeta.nn import FlashAttention +attn_module = FlashAttention() +output = attn_module(query_tensor, key_tensor, value_tensor) +``` + +2. **Using Flash Attention with Masking**: +```python +from zeta.nn import FlashAttention +attn_module = FlashAttention(flash=True) +mask = attn_module.get_mask(query_length, key_length, device) +output = attn_module(query_tensor, key_tensor, value_tensor, mask=mask) +``` + +3. **Using Causal Flash Attention with Dropout**: +```python +from zeta.nn import FlashAttention +attn_module = FlashAttention(causal=True, dropout=0.1, flash=True) +output = attn_module(query_tensor, key_tensor, value_tensor) +``` + +--- + +## Additional Tips: + +- The `FlashAttention` module is optimized for NVIDIA A100 GPUs. On these GPUs, using `flash=True` is recommended for faster computation. +- Ensure that PyTorch version is 2.0 or above when enabling flash attention. +- The mask generated using `get_mask` method is useful for attention computations where certain positions need to be masked out. 
+ +--- + +## References: + +- Original Attention Mechanism: [Attention Is All You Need](https://arxiv.org/abs/1706.03762) \ No newline at end of file diff --git a/docs/zeta/nn/attention/multihead.md b/docs/zeta/nn/attention/multihead.md new file mode 100644 index 0000000..6646190 --- /dev/null +++ b/docs/zeta/nn/attention/multihead.md @@ -0,0 +1,106 @@ +# Multihead Attention Documentation for Zeta Library + +## Introduction + +`MultiheadAttention` is a module in the Zeta library that provides multi-head attention mechanism. This mechanism enables the model to focus on different parts of the input sequence simultaneously. It's widely used in models such as transformers for capturing various aspects of information in the input. + +## Purpose + +The purpose of the `MultiheadAttention` module is to allow joint information representation from different subspaces of the input sequence. This results in capturing a richer context when modeling sequences. + +## Architecture + +The `MultiheadAttention` class extends from the `nn.Module` base class. Internally, it uses linear transformations for keys, values, and queries (`k_proj`, `v_proj`, `q_proj`). These projections are wrapped using the `MultiwayWrapper`. It also utilizes layer normalization (`inner_attn_ln`) and optionally uses relative positional embeddings (`xpos`). + +## Class Definition + +```python +class zeta.nn.embeddings.MultiheadAttention(nn.Module): +``` + +### Parameters: +- `args`: General arguments passed for configuring the module. +- `embed_dim` (int): Total dimension of the model. +- `num_heads` (int): Number of parallel attention heads. The embed_dim will be split across num_heads. +- `dropout` (float): Dropout probability. Default: 0.0. +- `self_attention` (bool): Whether to apply self attention. Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False. +- `encoder_decoder_attention` (bool): Whether to apply encoder-decoder attention. 
Only one of `self_attention` or `encoder_decoder_attention` can be True. Default: False. +- `subln` (bool): If True, applies layer normalization after self attention. Default: False. + +### Methods: + +#### `reset_parameters()` +Reinitialize the parameters of the attention module. + +#### `forward(query, key, value, ...)` +Computes the forward pass of the attention mechanism. + +- Parameters: + - `query` (Tensor): The query tensor. + - `key` (Tensor): The key tensor. + - `value` (Tensor): The value tensor. + - Other arguments including `incremental_state`, `key_padding_mask`, `attn_mask`, `rel_pos`, and `is_first_step`. + +- Returns: + - `attn` (Tensor): The computed attention tensor. + - `attn_weights` (Tensor): The attention weights. + +### Mathematical Formulation: + +Given a query \( Q \), key \( K \), and value \( V \), the multihead attention mechanism is mathematically represented as: + +\[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \] + +Where \( d_k \) is the dimension of the key. + +## Usage Examples: + +### Example 1: Basic Usage + +```python +from zeta.nn.embeddings import MultiheadAttention +import torch + +args = ... # Some configuration +attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True) +query = torch.rand((32, 10, 512)) +key = torch.rand((32, 10, 512)) +value = torch.rand((32, 10, 512)) + +attn, attn_weights = attention(query, key, value) +``` + +### Example 2: With Masking + +```python +from zeta.nn.embeddings import MultiheadAttention +import torch + +args = ... 
# Some configuration +attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, self_attention=True) +query = torch.rand((32, 10, 512)) +key = torch.rand((32, 10, 512)) +value = torch.rand((32, 10, 512)) +attn_mask = torch.ones((10, 10)).triu_() * -1e9 # Upper triangular mask + +attn, attn_weights = attention(query, key, value, attn_mask=attn_mask) +``` + +### Example 3: Encoder-Decoder Attention + +```python +from zeta.nn.embeddings import MultiheadAttention +import torch + +args = ... # Some configuration +attention = MultiheadAttention(args, embed_dim=512, num_heads=8, dropout=0.1, encoder_decoder_attention=True) +query = torch.rand((32, 10, 512)) # Decoder query +key = torch.rand((32, 20, 512)) # Encoder key +value = torch.rand((32, 20, 512)) # Encoder value + +attn, attn_weights = attention(query, key, value) +``` + +## Additional Tips: +- For encoder-decoder attention, make sure the dimensions of the encoder and decoder tensors match the expected input sizes. +- Using masks can be helpful to prevent the attention mechanism from focusing on certain parts of the sequence, such as padding. diff --git a/docs/zeta/nn/attention/multiquery.md b/docs/zeta/nn/attention/multiquery.md new file mode 100644 index 0000000..68fc46e --- /dev/null +++ b/docs/zeta/nn/attention/multiquery.md @@ -0,0 +1,108 @@ +# MultiQueryAttention + +## Overview and Introduction: + +The `MultiQueryAttention` class is a part of the Zeta library, designed to perform self-attention operations on given input data. Unlike traditional attention mechanisms that use a single query, this class leverages multiple queries to capture a broader range of context information. This class allows for various implementations of attention, including Flash, Triton, and Torch. It also provides the flexibility to choose normalization type, fully connected layer type, and offers debugging verbosity. 
+ +## Class Definition: + +```python +class MultiQueryAttention(nn.Module): + """Multi-Query self attention. + Using torch or triton attention implementation enables the user to also use + additive bias. + """ +``` + +### Parameters: +- `d_model` (int): Dimension of the model. +- `heads` (int): Number of parallel attention heads. +- `attn_impl` (str, optional): Attention implementation type, can be either 'triton', 'flash', or 'torch'. Default is 'triton'. +- `clip_qkv` (Optional[float]): Clipping value for query, key, and value. If specified, qkv is clamped within the range [-clip_qkv, clip_qkv]. +- `qk_ln` (bool, optional): If True, layer normalization is applied to query and key. +- `softmax_scale` (Optional[float]): Scale for softmax. Default value is computed as 1/sqrt(head_dim). +- `attn_pdrop` (float, optional): Attention dropout probability. Default is 0.0. +- `norm_type` (str, optional): Normalization type, default is 'low_precision_layernorm'. +- `fc_type` (str, optional): Fully connected layer type, default is 'torch'. +- `verbose` (int, optional): Verbosity level, default is 0. +- `device` (Optional[str]): Device to which the tensors should be moved. + +## Functionality and Usage: + +The `MultiQueryAttention` class operates by using multiple queries to capture broader context information from given data. This is achieved through the forward method which computes the self-attention on the given inputs. + +### Method: `forward` +```python +def forward( + self, + x, + past_key_value=None, + bias=None, + mask=None, + causal=True, + needs_weights=False, +): +``` + +#### Parameters: + +- `x` (Tensor): Input tensor. +- `past_key_value` (Optional): Past key and value for attention computation. Default is None. +- `bias` (Optional): Additive bias for attention scores. Default is None. +- `mask` (Optional): Key padding mask. Default is None. +- `causal` (bool, optional): If True, a causal mask is applied to prevent information flow from future tokens. 
Default is True. +- `needs_weights` (bool, optional): If True, attention weights are also returned. Default is False. + +#### Returns: + +- `context` (Tensor): Contextualized tensor after attention computation. +- `attn_weights` (Tensor, Optional): Attention weights. Only returned if `needs_weights` is True. +- `past_key_value` (Tensor, Optional): New past key and value. + +## Usage Examples: + +1. Basic Usage: +```python +from zeta import MultiQueryAttention +import torch + +# Initialize the attention module +attention_layer = MultiQueryAttention(d_model=512, heads=8, attn_impl='torch') + +# Random input tensor +x = torch.rand(16, 10, 512) # Batch of 16, sequence length 10, embedding size 512 +output, attn_weights, _ = attention_layer(x) +``` + +2. Using Past Key and Value: +```python +past_key_value = (torch.rand(16, 8, 10, 64), torch.rand(16, 8, 10, 64)) # Past key and value for 8 heads +output, attn_weights, new_past_key_value = attention_layer(x, past_key_value=past_key_value) +``` + +3. With Causal Masking and Weights: +```python +output, attn_weights, _ = attention_layer(x, causal=True, needs_weights=True) +``` + +## Mathematical Formula: + +For the self-attention mechanism, the computation involves using multiple queries (\( Q \)), keys (\( K \)), and values (\( V \)): + +```latex +\[ \text{Attention}(Q, K, V) = \text{Softmax}\left(\frac{Q \times K^T}{\sqrt{d_k}} + \text{Bias}\right) \times V \] +``` +Where: +- \( Q \), \( K \), and \( V \) are the queries, keys, and values respectively. +- \( d_k \) is the dimension of the keys. +- Bias is the optional additive bias. + +## Additional Information and Tips: + +- It's crucial to select the correct attention implementation (`attn_impl`) based on your needs and the hardware you're running on. +- The `triton` implementation might be faster than `flash` but can use more memory. Ensure that you have adequate GPU memory if using `triton`. 
+- If using the `torch` implementation, it's advisable to check if CUDA is available for GPU acceleration. +- The clipping of qkv (`clip_qkv`) can be beneficial for stability in training. + +## References and Resources: +For a deeper understanding of the self-attention mechanism and its variants, you can refer to the "Attention is All You Need" paper by Vaswani et al., 2017. \ No newline at end of file diff --git a/docs/zeta/nn/biases/alibi.md b/docs/zeta/nn/biases/alibi.md new file mode 100644 index 0000000..3f93dbe --- /dev/null +++ b/docs/zeta/nn/biases/alibi.md @@ -0,0 +1,90 @@ +# AlibiPositionalBias Documentation + +## Introduction + +The `AlibiPositionalBias` module belongs to the zeta library and plays a crucial role in handling positional bias for multi-head attention mechanisms. Specifically, it attempts to alleviate the absolute positional bias based on the number of attention heads. + +## Class Definition: + +```python +class AlibiPositionalBias(nn.Module): +``` + +### Parameters: +- **heads** (`int`): Number of attention heads for which the slopes need to be calculated. +- **total_heads** (`int`): Total number of attention heads in the network. + +### Attributes: +- **slopes** (`Tensor`): Tensor containing slope values, which are computed based on the number of heads. +- **bias** (`Tensor` or `None`): Tensor for storing positional bias values. If not initialized or needs recomputation, it would be None. + +### Methods: +#### `__init__(self, heads, total_heads, **kwargs) -> None`: +Initializes the `AlibiPositionalBias` module. + +#### `get_bias(self, i, j, device) -> Tensor`: +Computes the positional bias for given dimensions i and j. + +- **Parameters**: + - **i** (`int`): One dimension of the required positional bias. + - **j** (`int`): Second dimension of the required positional bias. + - **device** (`torch.device`): The device on which computations are to be performed. 
+ +#### `_get_slopes(heads) -> List[float]`: +A static method that calculates slopes based on the number of attention heads. + +- **Parameters**: + - **heads** (`int`): Number of attention heads. + +#### `forward(self, i, j) -> Tensor`: +Computes or retrieves the bias tensor for given dimensions. + +- **Parameters**: + - **i** (`int`): One dimension for the required positional bias. + - **j** (`int`): Second dimension for the required positional bias. + +## Mathematical Formula: + +Given `n` attention heads, the alibi positional bias can be represented as: + +\[ \text{Bias} = \text{-abs}(j_{\text{range}}) \times \text{slope} \] + +Where: +- \( j_{\text{range}} \) is an array of numbers from `0` to `j-1`. +- `slope` is computed based on the number of heads using `_get_slopes` method. + +## Usage Examples: + +### Example 1: Initialize and compute bias +```python +from zeta import AlibiPositionalBias +import torch + +bias_module = AlibiPositionalBias(heads=4, total_heads=8) +bias = bias_module(10, 10) +print(bias) +``` + +### Example 2: Retrieve stored bias +```python +bias = bias_module(5, 5) +print(bias) +``` + +### Example 3: Computing bias for different dimensions +```python +bias = bias_module(8, 15) +print(bias) +``` + +## Note: + +- It's crucial to ensure that the `total_heads` parameter is always greater than or equal to the `heads` parameter during initialization. +- The device property is internally used to determine the computation device based on the registered buffers. + +## References: + +For a deeper understanding and applications of positional bias in attention mechanisms, one may refer to the foundational paper on Transformer architectures: +- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) + +Also, the `einops` library provides a versatile interface for tensor manipulations. More details can be found at its official [documentation](https://einops.rocks/). 
\ No newline at end of file diff --git a/docs/zeta/nn/biases/relative_bias.md b/docs/zeta/nn/biases/relative_bias.md new file mode 100644 index 0000000..b3d0ec6 --- /dev/null +++ b/docs/zeta/nn/biases/relative_bias.md @@ -0,0 +1,81 @@ +# RelativePositionBias + +`RelativePositionBias` is a specialized PyTorch module designed to generate relative position biases, which can be vital for certain attention mechanisms in deep learning architectures. This module quantizes the distance between two positions into a certain number of buckets and then uses an embedding to get the relative position bias. This mechanism aids in the attention mechanism by providing biases based on relative positions between the query and key, rather than relying solely on their absolute positions. + +## Architecture: +The architecture can be visualized in three major steps: +1. **Bucketing:** Convert relative distances between two positions into bucket indices. +2. **Embedding:** Use the bucket indices to get embeddings for each pair of positions. +3. **Computing Bias:** Computes the bias values based on the embeddings. + +## Purpose: +In the context of attention mechanisms, especially the transformer-based architectures, the position of tokens can provide valuable information. The `RelativePositionBias` class helps introduce this information in a compact form by bucketing relative positions and then embedding them to serve as biases for the attention scores. 
+ +## Mathematical Formula: +Given a relative position \( r \), let \( n = -r \). The bucket index \( b \) is computed as: +\[ b = +\begin{cases} + n + \text{num\_buckets} \div 2 & \text{if } n < 0 \text{ and bidirectional is True} \\ + \min\left( \text{max\_exact} + \left(\frac{\log(\frac{n}{\text{max\_exact}})}{\log(\frac{\text{max\_distance}}{\text{max\_exact}})} \times (\text{num\_buckets} - \text{max\_exact})\right), \text{num\_buckets} - 1 \right) & \text{otherwise} + \end{cases} +\] +Where \( n \) is the negative of the relative position, \( \text{max\_exact} \) is \( \text{num\_buckets} \div 2 \), and \( \text{num\_buckets} \) and \( \text{max\_distance} \) are the constructor parameters described below. + +## Class Definition: + +```python +class RelativePositionBias(nn.Module): + """ + Compute relative position bias which can be utilized in attention mechanisms. + + Parameters: + - bidirectional (bool): If True, considers both forward and backward relative positions. Default: True. + - num_buckets (int): Number of buckets to cluster relative position distances. Default: 32. + - max_distance (int): Maximum distance to be considered for bucketing. Distances beyond this will be mapped to the last bucket. Default: 128. + - n_heads (int): Number of attention heads. Default: 12. + """ +``` + +### Key Methods: +- **`_relative_position_bucket`**: This static method is responsible for converting relative positions into bucket indices. +- **`compute_bias`**: Computes the relative position bias for given lengths of queries and keys. +- **`forward`**: Computes and returns the relative position biases for a batch. 
+ +## Usage Examples: + +```python +from zeta import RelativePositionBias +import torch + +# Initialize the RelativePositionBias module +rel_pos_bias = RelativePositionBias() + +# Example 1: Compute bias for a single batch +bias_matrix = rel_pos_bias(1, 10, 10) + +# Example 2: Utilize in conjunction with an attention mechanism +# NOTE: This is a mock example, and may not represent an actual attention mechanism's complete implementation.
+class MockAttention(torch.nn.Module): + def __init__(self): + super().__init__() + self.rel_pos_bias = RelativePositionBias() + + def forward(self, queries, keys): + bias = self.rel_pos_bias(queries.size(0), queries.size(1), keys.size(1)) + # Further computations with bias in the attention mechanism... + return None # Placeholder + +# Example 3: Modify default configurations +custom_rel_pos_bias = RelativePositionBias(bidirectional=False, num_buckets=64, max_distance=256, n_heads=8) +``` + +## Tips: +1. The choice of `num_buckets` and `max_distance` might need tuning based on the dataset and application. +2. If the architecture doesn't need bidirectional biases, set `bidirectional` to `False` to reduce computation. +3. Ensure that the device of tensors being processed and the device of the `RelativePositionBias` module are the same. + +## References: +- [Attention Is All You Need](https://arxiv.org/abs/1706.03762) +- [Transformer Architectures](https://www.aclweb.org/anthology/D18-1422.pdf) + +Note: This documentation is based on the provided code and might need adjustments when integrated into the complete `zeta` library. \ No newline at end of file diff --git a/docs/zeta/nn/biases/xpos.md b/docs/zeta/nn/biases/xpos.md new file mode 100644 index 0000000..88b46b4 --- /dev/null +++ b/docs/zeta/nn/biases/xpos.md @@ -0,0 +1,105 @@ +# XPOS Module Documentation +------------------------- + +### Architecture + +The XPOS module is a part of a neural network model and is implemented as a subclass of `torch.nn.Module`. It consists of several functions and a class that work together to apply rotary positional embeddings to an input tensor. + +### Purpose + +The purpose of the XPOS module is to incorporate positional information into the input tensor of a neural network model. It achieves this by generating fixed positional embeddings and applying them to the input tensor using rotary positional encoding techniques. 
This allows the model to capture the sequential order and relative positions of the input elements, which can be beneficial for tasks such as natural language processing and time series analysis. + +### Functions and Methods + +1. `fixed_pos_embedding(x)`: Generates fixed positional embeddings for the input tensor. + + - Args: + - `x` (torch.Tensor): Input tensor of shape `(seq_len, dim)`. + - Returns: + - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`. + - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`. +2. `rotate_every_two(x)`: Rearranges the elements of the input tensor by rotating every two elements. + + - Args: + - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`. + - Returns: + - `x` (torch.Tensor): Rearranged tensor of shape `(batch_size, seq_len, dim)`. +3. `duplicate_interleave(m)`: Duplicates a matrix while interleaving the copy. + + - Args: + - `m` (torch.Tensor): Input matrix. + - Returns: + - `m` (torch.Tensor): Duplicated and interleaved matrix. +4. `apply_rotary_pos_emb(x, sin, cos, scale=1)`: Applies rotary positional embeddings to the input tensor. + + - Args: + - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`. + - `sin` (torch.Tensor): Sine positional embeddings of shape `(seq_len, dim)`. + - `cos` (torch.Tensor): Cosine positional embeddings of shape `(seq_len, dim)`. + - `scale` (float): Scaling factor for the positional embeddings (default: 1). + - Returns: + - `x` (torch.Tensor): Tensor with applied rotary positional embeddings. +5. `XPOS(head_dim, scale_base=512)`: XPOS module class. + + - Args: + - `head_dim` (int): Dimensionality of the input tensor. + - `scale_base` (int): Base value for scaling the positional embeddings (default: 512). + - Methods: + - `forward(x, offset=0, downscale=False)`: Forward pass of the XPOS module. + - Args: + - `x` (torch.Tensor): Input tensor of shape `(batch_size, seq_len, dim)`. 
+ - `offset` (int): Offset value for positional embeddings (default: 0). + - `downscale` (bool): Boolean indicating whether to downscale the positional embeddings (default: False). + - Returns: + - `x` (torch.Tensor): Tensor with applied rotary positional embeddings. + +### Usage Examples + +1. Applying XPOS module to an input tensor: + + ``` + import torch + from zeta import XPOS + + # Create an instance of the XPOS module + xpos = XPOS(head_dim=256) + + # Generate a random input tensor + x = torch.randn(1, 10, 256) + + # Apply the XPOS module to the input tensor + output = xpos(x) + ``` + + +2. Applying XPOS module with offset and downscaling: + + ``` + import torch + from zeta import XPOS + + # Create an instance of the XPOS module + xpos = XPOS(head_dim=512) + + # Generate a random input tensor + x = torch.randn(1, 20, 512) + + # Apply the XPOS module to the input tensor with offset and downscaling + output = xpos(x, offset=2, downscale=True) + ``` +3. Using the individual functions of the XPOS module: + + ``` + import torch + from zeta import fixed_pos_embedding, apply_rotary_pos_emb + + # Generate fixed positional embeddings + scale = torch.randn(10, 256) + sin, cos = fixed_pos_embedding(scale) + + # Apply rotary positional embeddings to an input tensor + x = torch.randn(1, 10, 256) + output = apply_rotary_pos_emb(x, sin, cos, scale=0.5) + ``` + +Note: The above examples assume that the contents of the `xpos.py` file are importable from the `zeta` package. \ No newline at end of file diff --git a/docs/zeta/nn/embeddings/multiway.md b/docs/zeta/nn/embeddings/multiway.md new file mode 100644 index 0000000..e8d998a --- /dev/null +++ b/docs/zeta/nn/embeddings/multiway.md @@ -0,0 +1,123 @@ +# **Documentation for `MultiwayEmbedding` in Zeta Library** + +**Table of Contents** + +1. Overview +2. Class Definition and Parameters +3. Methods and Functionalities +4. Usage Examples +5. Additional Tips and Information +6. References + +--- + +## 1. 
Overview + +The `MultiwayEmbedding` class in the Zeta library provides a way to apply two separate embeddings to two distinct parts of the input tensor. It splits the input tensor at the specified position and applies one embedding to the first part and another embedding to the second part. This can be particularly useful when dealing with inputs that require diverse representations or embeddings. + +--- + +## 2. Class Definition and Parameters + +```python +class MultiwayEmbedding(MultiwayNetwork): + """ + A specialized version of the MultiwayNetwork to perform multi-way embeddings on an input tensor. + + Parameters: + - modules (List[nn.Module]): A list containing exactly two PyTorch modules. Typically these would be embedding layers. + - dim (int): The dimension along which to split and concatenate the input tensor. Default is 1. + """ + + def __init__(self, modules, dim=1): + super(MultiwayNetwork, self).__init__() + ... +``` + +--- + +## 3. Methods and Functionalities + +**forward(x, **kwargs)** +```python +def forward(self, x, **kwargs): + """ + Forward method to apply embeddings on the split input tensor. + + Parameters: + - x (torch.Tensor): The input tensor. + - **kwargs: Additional arguments that might be needed for the embeddings. + + Returns: + - torch.Tensor: Concatenated tensor after applying the embeddings. + """ + ... +``` + +--- + +## 4. 
Usage Examples + +**Example 1:** Basic Usage +```python +from zeta import MultiwayEmbedding +import torch.nn as nn + +emb1 = nn.Embedding(10, 5) +emb2 = nn.Embedding(10, 5) +multiway_emb = MultiwayEmbedding([emb1, emb2]) + +x = torch.LongTensor([[1,2,3],[4,5,6]]) +output = multiway_emb(x) +print(output) +``` + +**Example 2:** Setting a Split Position +```python +from zeta import MultiwayEmbedding, set_split_position +import torch.nn as nn + +emb1 = nn.Embedding(10, 5) +emb2 = nn.Embedding(10, 5) +multiway_emb = MultiwayEmbedding([emb1, emb2]) +multiway_emb.apply(set_split_position(2)) + +x = torch.LongTensor([[1,2,3],[4,5,6]]) +output = multiway_emb(x) +print(output) +``` + +**Example 3:** Working with Different Embedding Dimensions +```python +from zeta import MultiwayEmbedding +import torch.nn as nn + +emb1 = nn.Embedding(10, 5) +emb2 = nn.Embedding(10, 7) +multiway_emb = MultiwayEmbedding([emb1, emb2], dim=2) + +x = torch.LongTensor([[1,2,3],[4,5,6]]) +output = multiway_emb(x) +print(output) +``` + +--- + +## 5. Additional Tips and Information + +- Ensure that the input tensor's dimensions align with the expected embeddings. If there's a mismatch in dimensions, a runtime error will occur. +- The split position determines the point at which the tensor is divided. It's crucial to set this appropriately, especially if the embeddings have different dimensions. +- Using the provided `set_split_position` utility function makes it easy to apply the split position for the embeddings. + +--- + +## 6. References + +- Torch documentation: [Link to PyTorch Documentation](https://pytorch.org/docs/stable/index.html) +- Agora: [Link to Agora's GitHub](#) (assuming there might be a GitHub link or other resource for Agora) + +--- + +**Note:** Ensure that the tensor operations align mathematically, especially if you're concatenating tensors with different dimensions. In such cases, ensure the embeddings produce tensors that can be concatenated along the specified dimension. 
+ +**Mathematical Explanation:** Given an input tensor \( X \) split into \( X_1 \) and \( X_2 \), and two embeddings \( A \) and \( B \), the output is given by concatenating \( A(X_1) \) and \( B(X_2) \). \ No newline at end of file diff --git a/docs/zeta/nn/embeddings/rope.md b/docs/zeta/nn/embeddings/rope.md new file mode 100644 index 0000000..7dd8622 --- /dev/null +++ b/docs/zeta/nn/embeddings/rope.md @@ -0,0 +1,145 @@ +# RotaryEmbedding + +`RotaryEmbedding` is a PyTorch module implementing the rotary embedding mechanism. It is designed to handle sequences of any length without the need for fine-tuning, and can also incorporate positional information into the embeddings. + +## Class Definition + +```python +class RotaryEmbedding(nn.Module): + def __init__( + self, + dim, + use_xpos=False, + scale_base=512, + interpolation_factor=1., + base=10000, + base_rescale_factor=1., + ): + ... +``` + +### Parameters + +- `dim` (int): The dimensionality of the embeddings. +- `use_xpos` (bool, optional): Whether to use positional information in the embeddings. Default: `False`. +- `scale_base` (int, optional): Base of the scale for positional information. Default: `512`. +- `interpolation_factor` (float, optional): Factor used for interpolating the embeddings. Default: `1.0`. +- `base` (int, optional): Base of the frequencies used in the embeddings. Default: `10000`. +- `base_rescale_factor` (float, optional): Factor used for rescaling the base of the frequencies. Default: `1.0`. + +### Method: `forward` + +```python +def forward(self, seq_len, device): + ... +``` + +#### Parameters + +- `seq_len` (int): The length of the sequence. +- `device` (torch.device): The device on which the computation will be performed. + +#### Returns + +- `freqs` (Tensor): The computed frequencies for the embeddings. +- `scale` (Tensor): The computed scale for the embeddings. 
+ +## Functionality and Usage + +The `RotaryEmbedding` module computes rotary embeddings for a sequence of a given length. The embeddings are computed based on the frequency and scale of each position in the sequence. The frequency and scale are computed using the `inv_freq` and `scale` buffers registered in the module. + +The `forward` method computes the `freqs` and `scale` tensors based on the `seq_len` and `device` provided. The `freqs` tensor is computed by multiplying the `t` tensor, which contains the indices of the sequence, with the `inv_freq` tensor. The `scale` tensor is computed using the `scale` buffer and the `scale_base` parameter. + +The `freqs` and `scale` tensors are then concatenated along the last dimension and returned. + +### Usage Examples + +#### Example 1: Basic Usage + +```python +from zeta.nn import RotaryEmbedding +import torch +from torch import nn + +# Initialize the RotaryEmbedding module +rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True) + +# Compute the embeddings for a sequence of length 10 +seq_len = 10 +device = torch.device('cuda') +freqs, scale = rotary_embedding(seq_len, device) + +print(freqs) +print(scale) +``` + +#### Example 2: Using a Different Scale Base + +```python +from zeta.nn import RotaryEmbedding +import torch +from torch import nn + +# Initialize the RotaryEmbedding module with a different scale base +rotary_embedding = RotaryEmbedding(dim=64, use_xpos=True, scale_base=1024) + +# Compute the embeddings for a sequence of length 10 +seq_len = 10 +device = torch.device('cuda') +freqs, scale = rotary_embedding(seq_len, device) + +print(freqs) +print(scale) +``` + +#### Example 3: Without Positional Information + +```python +from zeta.nn import RotaryEmbedding +import torch +from torch import nn + +# Initialize the RotaryEmbedding module without positional information +rotary_embedding = RotaryEmbedding(dim=64, use_xpos=False) + +# Compute the embeddings for a sequence of length 10 +seq_len = 10 +device = 
torch.device('cuda') +freqs, scale = rotary_embedding(seq_len, device) + +print(freqs) +print(scale) +``` + +## Mathematical Formula + +The mathematical formula for computing the `freqs` tensor is: + +\[ \text{freqs} = t \cdot \text{inv\_freq} \] + +Where: +- \( t \) is a tensor containing the indices of the sequence. +- \( \text{inv\_freq} \) is a tensor containing the inverse frequencies. + +The mathematical formula for computing the `scale` tensor is: + +\[ \text{scale} = \text{scale}^{\frac{\text{power}}{\text{scale\_base}}} \] + +Where: +- \( \text{power} \) is a tensor containing the power of each position in the sequence. +- \( \text{scale\_base} \) is a scalar containing the base of the scale. +- \( \text{scale} \) is a tensor containing the scale of each position in the sequence. + +## Additional Information and Tips + +- The `interpolation_factor` parameter can be used to interpolate the embeddings for sequences of different lengths. A larger `interpolation_factor` will result in a smoother interpolation. +- The `base_rescale_factor` parameter can be used to rescale the base of the frequencies. This can be useful for adjusting the embeddings for sequences of different lengths. +- If `use_xpos` is set to `False`, the `scale` tensor will not be used, and the `freqs` tensor will be returned as is. + +## References and Resources + +- [Paper: Link to the paper](https://arxiv.org/pdf/2308.10882.pdf) +- [PyTorch Documentation](https://pytorch.org/docs/stable/index.html) +- [Einops Documentation](https://einops.rocks/pytorch-examples.html) + +Note: The above template includes the class definition, parameters, description, functionality, usage examples, mathematical formula, additional information and tips, and references and resources. To replicate the documentation for any other module or framework, follow the same structure and provide the specific details for that module or framework. 
\ No newline at end of file diff --git a/docs/zeta/nn/embeddings/truncated_rope.md b/docs/zeta/nn/embeddings/truncated_rope.md new file mode 100644 index 0000000..d0acd0c --- /dev/null +++ b/docs/zeta/nn/embeddings/truncated_rope.md @@ -0,0 +1,103 @@ +# Module/Function Name: TruncatedRotaryEmbedding + +The `TruncatedRotaryEmbedding` class is part of the Zeta library and is designed to implement the rotary embeddings with a truncation mechanism. The rotary embedding is a positional encoding method that aims to provide the model with information about the relative positions of the tokens in a sequence. The `TruncatedRotaryEmbedding` class extends the rotary embedding concept by incorporating a truncation mechanism, which sets the rotary embedding to zero for positions where the frequency is higher than a specified threshold. + +The architecture and workings of this class are inspired by the paper [link to the paper](https://arxiv.org/pdf/2308.10882.pdf). + +## Parameters: + +- `dim` (int): Dimensionality of the embeddings. +- `a` (float): Lower bound of the truncation region. Rotary embeddings with frequency lower than `a` will be set to zero. +- `b` (float): Upper bound of the truncation region. Rotary embeddings with frequency higher than or equal to `b` will not be truncated. +- `rho` (float): Value to which the rotary embeddings will be truncated in the region [a, b). + +The `dim` parameter is required to determine the dimensionality of the embeddings, while `a`, `b`, and `rho` are hyperparameters that control the truncation mechanism. + +## Method: + +### `forward(seq_len, device)` + +Computes the truncated rotary embeddings for a given sequence length. + +#### Parameters: + +- `seq_len` (int): Length of the sequence for which the rotary embeddings are to be computed. +- `device` (torch.device): Device on which the computations are to be performed. 
+ +#### Returns: + +- `result` (Tensor): A tensor containing the truncated rotary embeddings for the specified sequence length. + +## Functionality and Usage: + +The `TruncatedRotaryEmbedding` class is used to compute the truncated rotary embeddings for a given sequence length. The rotary embeddings are computed by multiplying a tensor containing the position indices of the tokens in the sequence by the inverse frequencies. The inverse frequencies are computed based on the specified embedding dimension `dim` and are stored in the `inv_freq` buffer. + +The truncation mechanism is implemented by creating a `theta_star` tensor, which is used to multiply the computed `freqs`. The `theta_star` tensor is created based on the specified `a`, `b`, and `rho` parameters, and the computed `freqs` tensor. For positions where the frequency is higher than or equal to `b`, the rotary embeddings are not truncated, and `theta_star` is set to the frequency at that position. For positions where the frequency is lower than `a`, the rotary embeddings are set to zero, and `theta_star` is set to zero. For positions where the frequency is in the range [a, b), the rotary embeddings are truncated to `rho`, and `theta_star` is set to `rho`. + +Once the `theta_star` tensor is created, it is multiplied element-wise by the `freqs` tensor to compute the final truncated rotary embeddings. 
+ +### Usage Example: + +```python +from zeta.nn.embeddings.truncated_rope import TruncatedRotaryEmbedding +import torch + +# Define the parameters +dim = 64 +a = 0.1 +b = 0.9 +rho = 0.5 +seq_len = 100 +device = torch.device('cuda') + +# Create the TruncatedRotaryEmbedding module +trunc_rotary_emb = TruncatedRotaryEmbedding(dim, a, b, rho) + +# Compute the truncated rotary embeddings for the specified sequence length +rotary_embeddings = trunc_rotary_emb(seq_len, device) + +print(rotary_embeddings) +``` + +In this example, the `TruncatedRotaryEmbedding` module is created with the specified `dim`, `a`, `b`, and `rho` parameters. The `forward` method is then called with the specified `seq_len` and `device` parameters to compute the truncated rotary embeddings for a sequence of length `seq_len`. + +## Additional Information and Tips: + +- The `a`, `b`, and `rho` parameters control the truncation mechanism and may need to be tuned based on the specific application and data being used. In particular, the `a` parameter should be set to a value that effectively removes the low-frequency components (those with frequency below `a`) in the rotary embeddings, while the `b` parameter should be set to a value that retains the useful positional information in the rotary embeddings. + +- The `dim` parameter should be set to the same value as the embedding dimension used in the model. + +- The `device` parameter in the `forward` method should be set to the same device on which the model is being trained. 
+ +## Mathematical Formulation: + +The mathematical formulation of the truncated rotary embeddings can be expressed as follows: + +\[ \text{freqs} = t \cdot \text{inv\_freq} \] + +\[ \theta = \text{base}^{-2 \cdot i / \text{dim}}, \, i = 0, 2, \ldots, \text{dim}-2 \] + +\[ \theta^* = +\begin{cases} +0, & \text{if } \theta < a \\ +\rho, & \text{if } a \leq \theta < b \\ +\theta, & \text{if } \theta \geq b +\end{cases} +\] + +\[ \text{result} = \text{freqs} \cdot \theta^* \] + +Where: + +- \( t \) is a tensor containing the position indices of the tokens in the sequence. +- \( \text{inv\_freq} \) is a tensor containing the inverse frequencies computed based on the specified `dim` parameter. +- \( \text{freqs} \) is a tensor containing the computed frequencies for each position in the sequence. +- \( \theta \) is a tensor containing the computed theta values for each position in the sequence. +- \( \theta^* \) is a tensor containing the truncated theta values for each position in the sequence. +- \( \text{result} \) is the final tensor containing the truncated rotary embeddings for each position in the sequence. + +## References and Resources: + +- Paper: [Link to the paper](https://arxiv.org/pdf/2308.10882.pdf) + +For further exploration and implementation details, refer to the paper linked above. \ No newline at end of file diff --git a/docs/zeta/nn/modules/lora.md b/docs/zeta/nn/modules/lora.md new file mode 100644 index 0000000..84c0a7a --- /dev/null +++ b/docs/zeta/nn/modules/lora.md @@ -0,0 +1,160 @@ +# Lora + +The `Lora` class is a module of the Zeta library that provides a simple linear transformation of the input data. It is a part of the `torch.nn` module and extends the `nn.Module` class from PyTorch. + +## Overview and Introduction + +The `Lora` class is designed to provide a scalable and efficient linear transformation operation. 
It is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. The `Lora` class achieves this by breaking down the weight matrix into two lower rank matrices `A` and `B`, and a scale factor `alpha`, which are learned during the training process. This results in a significant reduction in the number of parameters to be learned, and consequently, a more computationally efficient model. + +## Key Concepts and Terminology + +- **Linear Transformation**: A linear transformation is a mathematical operation that transforms input data by multiplying it with a weight matrix. It is a fundamental operation in many machine learning models. + +- **Low Rank Approximation**: Low rank approximation is a technique used to approximate a matrix by another matrix of lower rank. This is often used to reduce the dimensionality of data and to make computations more efficient. + +- **Scale Factor**: A scale factor is a number by which a quantity is multiplied, changing the magnitude of the quantity. + +## Class Definition + +The `Lora` class is defined as follows: + +```python +class Lora(nn.Module): + def __init__( + self, + dim, + dim_out, + r=8, + alpha=None + ): + super().__init__() + self.scale = alpha / r + + self.A = nn.Parameter(torch.randn(dim, r)) + self.B = nn.Parameter(torch.randn(r, dim_out)) + + @property + def weight(self): + return (self.A @ self.B) * self.scale + + def forward(self, x): + return x @ self.weight +``` + +### Parameters + +- `dim` (`int`): The dimensionality of the input data. It is the number of features in the input data. +- `dim_out` (`int`): The desired dimensionality of the output data. It is the number of features in the output data. +- `r` (`int`, optional): The rank of the matrices `A` and `B`. It determines the size of the matrices `A` and `B`. Default is 8. +- `alpha` (`float`, optional): The scale factor. If not provided, it is set to 1 by default. 
+ +### Methods + +#### `forward` + +The `forward` method is used to compute the forward pass of the `Lora` module. + +##### Parameters + +- `x` (`Tensor`): The input data. It is a tensor of shape `(batch_size, dim)`. + +##### Returns + +- `Tensor`: The transformed data. It is a tensor of shape `(batch_size, dim_out)`. + +## Functionality and Usage + +The `Lora` class is used to perform a linear transformation of the input data. The transformation is defined by the weight matrix `W`, which is approximated by the product of two lower rank matrices `A` and `B`, and a scale factor `alpha`. The `Lora` class learns the matrices `A` and `B` during the training process; the scale factor `alpha` is a fixed hyperparameter (the weight is scaled by `alpha / r`). + +The forward pass of the `Lora` module computes the product of the input data `x` and the weight matrix `W`, which is approximated by `(A @ B) * scale`. + +### Mathematical Formula + +The mathematical formula for the forward pass of the `Lora` module is: + +\[ y = xW \] + +Where: +- \( y \) is the transformed data. +- \( x \) is the input data. +- \( W \) is the weight matrix, which is approximated by \( (A @ B) * \text{scale} \). + +### Usage Examples + +Below are three examples of how to use the `Lora` class. 
+ +#### Example 1: Basic Usage + +```python +import torch +from zeta import Lora + +# Define the input data +x = torch.randn(32, 128) # batch size of 32, and 128 features + +# Define the Lora module +lora = Lora(dim=128, dim_out=64) + +# Compute the forward pass +y = lora(x) +``` + +#### Example 2: Specifying the Rank and Scale Factor + +```python +import torch +from zeta import Lora + +# Define the input data +x = torch.randn(32, 128) # batch size of 32, and 128 features + +# Define the Lora module with specified rank and scale factor +lora = Lora(dim=128, dim_out=64, r=16, alpha=0.1) + +# Compute the forward pass +y = lora(x) +``` + +#### Example 3: Using the Lora Module in a Neural Network + +```python +import torch +from torch import nn +from zeta import Lora + +# Define a simple neural network with a Lora layer +class Net(nn.Module): + def __init__(self): + super().__init__() + self.lora = Lora(dim=128, dim_out=64) + self.fc = nn.Linear(64, 10) + + def forward(self, x): + x = self.lora(x) + x = self.fc(x) + return x + +# Define the input data +x = torch.randn(32, 128) # batch size of 32, and 128 features + +# Define the model +model = Net() + +# Compute the forward pass +output = model(x) +``` + +## Additional Information and Tips + +- The `Lora` class is particularly useful in scenarios where the dimensionality of the input data is very high and computational efficiency is of paramount importance. However, it may not be suitable for all applications, as the approximation of the weight matrix may result in a loss of accuracy. + +- The rank `r` and the scale factor `alpha` are hyperparameters that need to be tuned for the specific application. A higher value of `r` will + + result in a more accurate approximation of the weight matrix, but will also increase the computational cost. Similarly, the scale factor `alpha` needs to be tuned to achieve the desired trade-off between accuracy and computational efficiency. 
+ +## References and Resources + +- [PyTorch nn.Module documentation](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) +- [Low Rank Matrix Factorization for Deep Neural Network Training with High-dimensional Output Targets](https://arxiv.org/abs/2005.08735) + +For further exploration and implementation details, you can refer to the above resources and the official PyTorch documentation. \ No newline at end of file diff --git a/docs/zeta/nn/modules/token_learner.md b/docs/zeta/nn/modules/token_learner.md new file mode 100644 index 0000000..794dd77 --- /dev/null +++ b/docs/zeta/nn/modules/token_learner.md @@ -0,0 +1,148 @@ +# Zeta Library Documentation + +## Module Name: TokenLearner + +The `TokenLearner` is a PyTorch module designed for learning tokens from input data. It is a part of the Zeta library, a collection of modules and functions designed for efficient and flexible implementation of various deep learning tasks. The `TokenLearner` class is particularly useful for tasks such as image classification, object detection, and other applications where it is beneficial to extract tokens (representative features) from the input data. + +## Introduction + +In various deep learning tasks, it is common to extract tokens (representative features) from the input data. These tokens are then used for downstream tasks like classification, detection, etc. The `TokenLearner` class is designed to efficiently extract tokens from the input data. It does this by utilizing a convolutional neural network (CNN) with grouped convolutions and a gating mechanism. + +## Class Definition + +```python +class TokenLearner(nn.Module): + def __init__( + self, + *, + dim: int = None, + ff_mult: int = 2, + num_output_tokens: int = 8, + num_layers: int = 2 + ): + ... +``` + +### Parameters: + +- `dim` (int, optional): The dimension of the input data. Default is `None`. +- `ff_mult` (int, optional): The factor by which the inner dimension of the network will be multiplied. 
Default is `2`. +- `num_output_tokens` (int, optional): The number of tokens to be output by the network. Default is `8`. +- `num_layers` (int, optional): The number of layers in the network. Default is `2`. + +## Functionality and Usage + +The `TokenLearner` class is a PyTorch `nn.Module` that learns tokens from the input data. The input data is first packed and then processed through a series of grouped convolutions followed by a gating mechanism. The output is a set of tokens that are representative of the input data. + +The forward method of the `TokenLearner` class takes an input tensor `x` and performs the following operations: + +1. The input tensor `x` is packed using the `pack_one` helper function. +2. The packed tensor is then rearranged and passed through a series of grouped convolutions and activation functions. +3. The output of the convolutions is then rearranged and multiplied with the input tensor. +4. The resulting tensor is then reduced to obtain the final tokens. + +### Method: + +```python +def forward(self, x): + ... +``` + +### Parameters: + +- `x` (Tensor): The input tensor of shape `(batch_size, channels, height, width)`. + +### Returns: + +- `x` (Tensor): The output tokens of shape `(batch_size, channels, num_output_tokens)`. + +## Usage Examples + +### Example 1: Basic Usage + +```python +from zeta import TokenLearner +import torch + +# Initialize the TokenLearner +token_learner = TokenLearner(dim=64) + +# Generate some random input data +x = torch.randn(1, 64, 32, 32) + +# Forward pass +tokens = token_learner.forward(x) + +print(tokens.shape) +``` + +In this example, a `TokenLearner` is initialized with an input dimension of 64. A random tensor of shape `(1, 64, 32, 32)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(1, 64, 8)`. 
+ +### Example 2: Custom Parameters + +```python +from zeta import TokenLearner +import torch + +# Initialize the TokenLearner with custom parameters +token_learner = TokenLearner(dim=128, ff_mult=4, num_output_tokens=16) + +# Generate some random input data +x = torch.randn(2, 128, 64, 64) + +# Forward pass +tokens = token_learner.forward(x) + +print(tokens.shape) +# Output: torch.Size([2, 128, 16]) +``` + +In this example, a `TokenLearner` is initialized with custom parameters. A random tensor of shape `(2, 128, 64, 64)` is then passed through the `TokenLearner` to obtain the tokens. The output will be a tensor of shape `(2, 128, 16)`. + +### Example 3: Integration with Other PyTorch Modules + +```python +from zeta import TokenLearner +import torch +import torch.nn as nn + +# Initialize the TokenLearner +token_learner = TokenLearner(dim=64) + +# Generate some random input data +x = torch.randn(1, 64, 32, 32) + +# Define a simple model +model = nn.Sequential( + token_learner, + nn.Flatten(), + nn.Linear(64*8, 10) +) + +# Forward pass +output = model(x) + +print(output.shape) +# Output: torch.Size([1, 10]) +``` + +In this example, the `TokenLearner` is integrated into a simple model consisting of the `TokenLearner`, a `Flatten` layer, and a `Linear` layer. A random tensor of shape `(1, 64, 32, 32)` is then passed through the model to obtain the final output. The output will be a tensor of shape `(1, 10)`. + +## Mathematical Formulation + +The `TokenLearner` can be mathematically formulated as follows: + +Let `X` be the input tensor of shape `(B, C, H, W)`, where `B` is the batch size, `C` is the number of channels, `H` is the height, and `W` is the width. The `TokenLearner` first rearranges `X` to a tensor of shape `(B, G*C, H, W)`, where `G` is the number of output tokens. This is done by repeating `X` along the channel dimension `G` times. 
+ +The rearranged tensor is then passed through a series of grouped convolutions and activation functions to obtain a tensor `A` of shape `(B, G, H, W)`. This tensor is then rearranged and multiplied with the input tensor `X` to obtain a tensor of shape `(B, C, G, H, W)`. + +The final tokens are obtained by reducing this tensor along the `H` and `W` dimensions to obtain a tensor of shape `(B, C, G)`. + +## Additional Information and Tips + +- The `num_output_tokens` parameter controls the number of tokens that will be output by the `TokenLearner`. A larger number of output tokens will result in a more detailed representation of the input data, but will also increase the computational requirements. + +- The `ff_mult` parameter controls the inner dimension of the `TokenLearner`. A larger `ff_mult` will result in a larger capacity model, but will also increase the computational requirements. + +- The `TokenLearner` works best with input data that has a relatively small spatial dimension (e.g. 32x32 or 64x64). For larger input sizes, it may be beneficial to use a downsampling layer (e.g. `nn.MaxPool2d`) before passing the data through the `TokenLearner`. + diff --git a/docs/zeta/nn/utils/helpers.md b/docs/zeta/nn/utils/helpers.md new file mode 100644 index 0000000..6c518a0 --- /dev/null +++ b/docs/zeta/nn/utils/helpers.md @@ -0,0 +1,109 @@ +## Documentation + +### Overview + +The provided module comprises utility functions and classes to streamline specific operations with Python data structures and PyTorch models. The main aspects of the module are: + +- Checking the existence of a value. +- Implementing custom call behavior through classes. +- Custom decorators for function calls. +- Dictionary manipulation. +- Initialization of PyTorch layer parameters. + +### Functions and Classes + +1. **exists(val: Any) -> bool**: + Checks if the provided value is not `None`. + +2. 
**default(val: Any, d: Any) -> Any**: + Returns the value if it's not `None`; otherwise, it returns a default value. + +3. **once(fn: Callable) -> Callable**: + A decorator ensuring that the function is only called once. + +4. **eval_decorator(fn: Callable) -> Callable**: + A decorator for `torch.nn.Module` methods to switch the module to `eval` mode during the function call and revert to its original mode afterwards. + +5. **cast_tuple(val: Any, depth: int) -> Tuple**: + Casts a value to a tuple with a specific depth. + +6. **maybe(fn: Callable) -> Callable**: + A decorator that calls the function only if its first argument exists. + +7. **always**: + A class that always returns the specified value when called. + +8. **not_equals** and **equals**: + Classes that, when instantiated with a value, check if another value is (not) equal to the specified value. + +9. **init_zero_(layer: nn.Module) -> None**: + Initializes the weights and biases of a torch layer to zero. + +10. **pick_and_pop(keys: List[str], d: Dict) -> Dict**: + Extracts values from a dictionary based on provided keys. + +11. **group_dict_by_key(cond: Callable, d: Dict) -> Tuple[Dict, Dict]**: + Groups dictionary keys based on a given condition. + +12. **string_begins_with(prefix: str, str: str) -> bool**: + Checks if a string starts with a specific prefix. + +13. **group_by_key_prefix(prefix: str, d: Dict) -> Tuple[Dict, Dict]**: + Groups dictionary items by keys starting with a specific prefix. + +14. **groupby_prefix_and_trim(prefix: str, d: Dict) -> Tuple[Dict, Dict]**: + Similar to `group_by_key_prefix` but also removes the prefix from keys. + +### Usage Examples + +1. **Using the `once` decorator**: + + ```python + from zeta import once + + @once + def greet(): + print("Hello, World!") + + greet() # prints "Hello, World!" + greet() # Does nothing on the second call + ``` + +2. 
**Using the `eval_decorator` with PyTorch**: + + ```python + import torch.nn as nn + from zeta import eval_decorator + + class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.layer = nn.Linear(10, 10) + + @eval_decorator + def predict(self, x): + return self.layer(x) + + model = SimpleModel() + input_tensor = torch.randn(1, 10) + output = model.predict(input_tensor) # Automatically switches to eval mode and back + ``` + +3. **Dictionary Manipulation with Prefix Functions**: + + ```python + from zeta import group_by_key_prefix + + sample_dict = { + "user_name": "John", + "user_age": 25, + "order_id": 12345, + "order_date": "2023-01-01" + } + + user_data, order_data = group_by_key_prefix("user_", sample_dict) + print(user_data) # {'user_name': 'John', 'user_age': 25} + print(order_data) # {'order_id': 12345, 'order_date': '2023-01-01'} + ``` + +This module is a collection of general-purpose utility functions and classes, making many common operations more concise. It's beneficial when working with PyTorch models and various data manipulation tasks. \ No newline at end of file diff --git a/docs/zeta/tokenizers/language_tokenizer.md b/docs/zeta/tokenizers/language_tokenizer.md new file mode 100644 index 0000000..cfa3609 --- /dev/null +++ b/docs/zeta/tokenizers/language_tokenizer.md @@ -0,0 +1,91 @@ +# Module Name: LanguageTokenizerGPTX + +The `LanguageTokenizerGPTX` is an embedding utility tailored for the "EleutherAI/gpt-neox-20b" transformer model. This class allows for seamless tokenization and decoding operations, abstracting away the underlying complexity of the chosen transformer's tokenizer. + +## Introduction: +Language tokenization is a crucial step in natural language processing tasks. This module provides an interface to tokenize and decode text using the GPT-Neox-20b transformer from the EleutherAI project. 
With the ability to manage end-of-string tokens, padding tokens, and a fixed model length, `LanguageTokenizerGPTX` serves as a convenient wrapper for the actual tokenizer from the transformers library. + +## Class Definition: + +```python +class LanguageTokenizerGPTX: + def __init__(self): + ... + def tokenize_texts(self, texts: str) -> torch.Tensor: + ... + def decode(self, texts: torch.Tensor) -> str: + ... + def __len__(self) -> int: + ... +``` + +### Parameters: +The class does not take any parameters upon instantiation. It uses predefined parameters internally to load the tokenizer. + +### Methods: + +#### 1. `__init__(self) -> None`: +Initializes the `LanguageTokenizerGPTX` object. This method loads the `AutoTokenizer` with predefined parameters. + +#### 2. `tokenize_texts(self, texts: str) -> torch.Tensor`: +Tokenizes a given text or list of texts. + +- **texts** (str): The input text(s) to tokenize. + + **Returns**: + - A torch Tensor of token IDs representing the input text(s). + +#### 3. `decode(self, texts: torch.Tensor) -> str`: +Decodes a given tensor of token IDs back to text. + +- **texts** (torch.Tensor): The tensor of token IDs to decode. + + **Returns**: + - A string representing the decoded text. + +#### 4. `__len__(self) -> int`: +Provides the total number of tokens in the tokenizer's vocabulary. + + **Returns**: + - An integer representing the total number of tokens. + +## Usage Examples: + +```python +from zeta import LanguageTokenizerGPTX +import torch + +# Initialize the tokenizer +tokenizer = LanguageTokenizerGPTX() + +# Example 1: Tokenize a single text +text = "Hello, world!" 
+tokenized_text = tokenizer.tokenize_texts(text) +print(tokenized_text) + +# Example 2: Decode a tokenized text +decoded_text = tokenizer.decode(tokenized_text) +print(decoded_text) + +# Example 3: Get the number of tokens in the tokenizer's vocabulary +num_tokens = len(tokenizer) +print(f"The tokenizer has {num_tokens} tokens.") +``` + +## Mathematical Formulation: + +Given a text \( t \) and a vocabulary \( V \) from the GPT-Neox-20b model, tokenization maps \( t \) to a sequence of token IDs \( T \) where each token ID \( t_i \) corresponds to a token in \( V \). Decoding reverses this process. + +\[ t \xrightarrow{\text{tokenize}} T \] +\[ T \xrightarrow{\text{decode}} t \] + +## Additional Information: + +The GPT-Neox-20b model is part of the EleutherAI project. It's a variant of the GPT architecture with tweaks in terms of model size and training. Utilizing such models require an understanding of tokenization and decoding, which this module aims to simplify. + +## References: + +- [Transformers Library by Hugging Face](https://huggingface.co/transformers/) +- [EleutherAI GPT-Neox](https://github.com/EleutherAI/gpt-neox) + +Note: Ensure you have the necessary packages and dependencies installed, particularly the transformers library from Hugging Face. \ No newline at end of file diff --git a/docs/zeta/tokenizers/multi_modal_tokenizer.md b/docs/zeta/tokenizers/multi_modal_tokenizer.md new file mode 100644 index 0000000..a0f682a --- /dev/null +++ b/docs/zeta/tokenizers/multi_modal_tokenizer.md @@ -0,0 +1,168 @@ +# **Documentation for Zeta Library's MultiModalTokenizer Class** + +--- + +## **Introduction and Overview** + +The `MultiModalTokenizer` class is part of the Zeta Library, designed to provide tokenization capabilities for both text and image data. This enables more seamless integration and utilization of multimodal (text and image) data, especially when used with models that can handle such information simultaneously, like the CLIP model. 
+ +**Key Features**: + +1. **Multimodal Tokenization**: Combines text and image tokenization within one unified class. +2. **Integration with Hugging Face Transformers**: Utilizes the `CLIPProcessor` for image tokenization and `AutoTokenizer` for text tokenization. +3. **Special Tokens for Image Segmentation**: Uses special tokens `` and `` to denote image token boundaries within text. +4. **Error Handling**: Implements comprehensive error handling and logging to ensure robustness. + +--- + +## **Class Definition** + +### **MultiModalTokenizer** + +```python +class MultiModalTokenizer: + """ + A tokenizer class for the kosmos model + + Attributes: + processor(CLIPProcessor): The processor to tokenize images. + tokenizer(AutoTokenizer): The tokenizer to tokenize text. + im_idx(int): The Index of the "" token. + im_end_idx(int): The index of the "" token. + """ +``` + +#### **Parameters**: + +- **max_length (int, optional)**: Maximum length of the tokenized sequence. Defaults to 8192. + +#### **Attributes**: + +- **processor (CLIPProcessor)**: The processor used to tokenize images. +- **tokenizer (AutoTokenizer)**: The tokenizer used to tokenize text. +- **im_idx (int)**: Index of the `` token. +- **im_end_idx (int)**: Index of the `` token. + +--- + +## **Methods** + +### **1. tokenize_texts** + +```python +def tokenize_texts(self, texts: str) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Tokenize given texts. + + Args: + texts (str): The text to be tokenized. + + Returns: + A tuple containing the tokenized texts and only the text tokens. + """ +``` + +### **2. tokenize_images** + +```python +def tokenize_images(self, images) -> torch.Tensor: + """ + Tokenizes given images. + + Args: + images: The images to be tokenized. + + Returns: + The tokenized images. + """ +``` + +### **3. tokenize** + +```python +def tokenize(self, sample) -> Dict[str, torch.Tensor]: + """ + Tokenizes given sample. + + Args: + sample: The sample to be tokenized. 
+ + Returns: + A dictionary containing the tokenized text tokens, images, labels, and attention mask. + """ +``` + +--- + +## **Usage Examples** + +### **Example 1: Tokenizing Texts** + +```python +from zeta import MultiModalTokenizer +import torch + +tokenizer = MultiModalTokenizer() +texts = ["Hello World", "Zeta Library is great!"] +tokenized_texts, only_texts = tokenizer.tokenize_texts(texts) +print(tokenized_texts) +print(only_texts) +``` + +### **Example 2: Tokenizing Images** + +```python +from zeta import MultiModalTokenizer +import torch + +tokenizer = MultiModalTokenizer() +images = torch.randn(2, 3, 224, 224) # Assuming 2 random images of shape 3x224x224 +tokenized_images = tokenizer.tokenize_images(images) +print(tokenized_images) +``` + +### **Example 3: Tokenizing Multimodal Data** + +```python +from zeta import MultiModalTokenizer +import torch + +tokenizer = MultiModalTokenizer() +sample = { + "target_text": ["Hello World", "Zeta Library is great!"], + "image": torch.randn(2, 3, 224, 224) +} +tokenized_data = tokenizer.tokenize(sample) +print(tokenized_data) +``` + +--- + +## **Mathematical Overview** + +Given a text sequence \( T \) of length \( n \) and an image \( I \) represented by a tensor of shape \( C \times H \times W \), where \( C \) is the number of channels, \( H \) is the height, and \( W \) is the width: + +1. The tokenized text, \( T' \), is represented as: + \[ T' = [, , , T_{1}, T_{2}, ..., T_{n}, ] \] + +2. The tokenized image, \( I' \), is processed using the CLIP processor to obtain a tensor representation. + +3. When both text and image data are tokenized using the `tokenize` method, the output contains both \( T' \) and \( I' \) with their respective attention masks. 
+ +--- + +## **Additional Tips** + +- Ensure you have the required model weights and configurations for the specified pretrained models ("laion/CLIP-ViT-L-14-laion2B-s32B-b82K" and "EleutherAI/gpt-neox-20b") downloaded or accessible from the Hugging Face Model Hub. + +- Handle potential tokenization errors gracefully using try-except blocks, as demonstrated in the provided methods. + +--- + +## **References and Resources** + +1. CLIP: Connecting Text and Images - OpenAI: [Link](https://openai.com/blog/clip/) +2. Hugging Face's Transformers library: [Link](https://huggingface.co/transformers/) +3. Documentation on Special Tokens in Transformers: [Link](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens) + +--- \ No newline at end of file diff --git a/docs/zeta/tokenizers/sentencepiece.md b/docs/zeta/tokenizers/sentencepiece.md new file mode 100644 index 0000000..caaed72 --- /dev/null +++ b/docs/zeta/tokenizers/sentencepiece.md @@ -0,0 +1,173 @@ +# SentencePieceTokenizer + +`SentencePieceTokenizer` is a class for tokenizing and detokenizing text using a pre-trained SentencePiece model. The SentencePiece model is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation tasks where the vocabulary size is predetermined prior to the neural model training. This class is a part of the zeta library which is a collection of various utility functions and classes for Natural Language Processing tasks. + +## Introduction + +Tokenization is a crucial step in many natural language processing tasks. It involves splitting a piece of text into smaller units, called tokens. These tokens can be as small as characters or as large as words. The `SentencePieceTokenizer` class provides an efficient and easy-to-use way to tokenize and detokenize text using a SentencePiece model.
+ +The SentencePiece model is trained to find the best tokenization by dynamically adjusting the size and boundary of tokens. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) and unigram language model with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing. + +## Class Definition + +```python +class SentencePieceTokenizer: + def __init__(self, model_path: str): + ... +``` + +### Parameters: + +- `model_path (str)`: The path to the pre-trained SentencePiece model. It should be a file with `.model` extension. + +### Attributes: + +- `n_words (int)`: The vocabulary size of the SentencePiece model. +- `bos_id (int)`: The token ID for the beginning of sentence token. +- `eos_id (int)`: The token ID for the end of sentence token. +- `pad_id (int)`: The token ID for the padding token. +- `prefix_id (int, optional)`: The token ID for the prefix token. +- `middle_id (int, optional)`: The token ID for the middle token. +- `suffix_id (int, optional)`: The token ID for the suffix token. +- `eot_id (int, optional)`: The token ID for the end of text token. + +## Methods + +### `encode` + +```python +def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + ... +``` + +Encodes a string into a list of integer token IDs. + +#### Parameters: + +- `s (str)`: The string to be encoded. +- `bos (bool)`: Whether to add the beginning of sentence token at the start. +- `eos (bool)`: Whether to add the end of sentence token at the end. + +#### Returns: + +- `List[int]`: A list of integer token IDs. + +### `decode` + +```python +def decode(self, t: List[int]) -> str: + ... +``` + +Decodes a list of integer token IDs into a string. + +#### Parameters: + +- `t (List[int])`: A list of integer token IDs to be decoded. + +#### Returns: + +- `str`: The decoded string. 
+ +### `encode_infilling` + +```python +def encode_infilling(self, s: str) -> List[int]: + ... +``` + +Encodes a string without an implicit leading space. + +#### Parameters: + +- `s (str)`: The string to be encoded. + +#### Returns: + +- `List[int]`: A list of integer token IDs. + +### `decode_infilling` + +```python +def decode_infilling(self, t: List[int]) -> str: + ... +``` + +Decodes a list of integer token IDs into a string without an implicit leading space. + +#### Parameters: + +- `t (List[int])`: A list of integer token IDs to be decoded. + +#### Returns: + +- `str`: The decoded string. + +## Usage Examples + +### Example 1: + +```python +from zeta import SentencePieceTokenizer + +tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') +text = "Hello, world!" +tokens = tokenizer.encode(text, bos=True, eos=True) +print(tokens) +# [2, 284, 16, 250, 13, 849, 4, 3] + +decoded_text = tokenizer.decode(tokens) +print(decoded_text) +# "Hello, world!" +``` + +### Example 2: + +```python +from zeta import SentencePieceTokenizer + +tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') +text = "Hello, world!" +tokens = tokenizer.encode_infilling(text) +print(tokens) +# [284, 16, 250, 13, 849, 4] + +decoded_text = tokenizer.decode_infilling(tokens) +print(decoded_text) +# "Hello, world!" +``` + +### Example 3: + +```python +from zeta import SentencePieceTokenizer + +tokenizer = SentencePieceTokenizer(model_path='path/to/your/model.model') +tokens = [2, 284, 16, 250, 13, 849, 4, 3] +decoded_text = tokenizer.decode(tokens) +print(decoded_text) +# "Hello, world!" +``` + +## Additional Information + +- Make sure that the model file specified in `model_path` exists. +- The special tokens such as `
<PRE>`, `<MID>`, `<SUF>`, `<EOT>` are optional and may not be present in all SentencePiece models.
+
+## References and Resources
+
+- [SentencePiece GitHub Repository](https://github.com/google/sentencepiece)
+- [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing](https://arxiv.org/abs/1808.06226)
+
+## Mathematical Formulation
+
+The SentencePiece model uses the following mathematical formula for tokenization:
+
+\[P(w) = \prod_{i=1}^{n} P(w_i | w_1, ..., w_{i-1})\]
+
+Where:
+- \(P(w)\) is the probability of the word \(w\).
+- \(n\) is the number of subwords in the word \(w\).
+- \(w_i\) is the \(i\)-th subword of \(w\).
+
+The model is trained to maximize the likelihood of the training data, and the subwords are chosen to minimize the perplexity of the training data.
\ No newline at end of file
diff --git a/docs/zeta/training/nebula.md b/docs/zeta/training/nebula.md
new file mode 100644
index 0000000..2d729a2
--- /dev/null
+++ b/docs/zeta/training/nebula.md
@@ -0,0 +1,138 @@
+# Nebula
+
+The `Nebula` class is a custom loss function class that dynamically determines the most suitable loss function for a given dataset based on certain characteristics of the dataset, such as sparsity, correlation, range of values, and user input. It is part of the `zeta` library and subclasses the `LossFunction` base class (see the class definition below).
+
+## Introduction
+
+The purpose of the `Nebula` class is to help determine and cache the most suitable loss function for a given dataset without requiring the user to manually select one. This can be particularly useful in scenarios where the user is unsure of the most appropriate loss function to use or in automated systems where the type of problem (classification or regression) is not known a priori.
+
+The `Nebula` class considers various characteristics of the data, such as whether the target values are integers, the sparsity of the target values, the correlation between predictions and target values, and any user or domain knowledge provided, to determine whether the problem is a classification or regression problem and subsequently select an appropriate loss function.
+
+## Class Definition
+
+```python
+class Nebula(LossFunction):
+    def __init__(self, domain_knowledge=None, user_input=None):
+        ...
+```
+
+### Parameters
+
+- `domain_knowledge` (str, optional): Domain knowledge about the problem. It can be either "classification" or "regression". Default is `None`.
+- `user_input` (str, optional): User input about the problem type. It can be either "classification" or "regression". Default is `None`.
+
+### Attributes
+
+- `loss_function`: The determined loss function.
+- `domain_knowledge`: Domain knowledge provided during initialization.
+- `user_input`: User input provided during initialization.
+- `loss_function_cache`: A cache for storing the determined loss function for a dataset.
+- `unique_values_cache`: A cache for storing the unique values in the target variable `y_true`.
+- `class_balance_cache`: A cache for storing the class balance in the target variable `y_true`.
+- `logger`: A logger for logging information during the determination of the loss function.
+
+## Functionality and Usage
+
+The `Nebula` class is used to dynamically determine the most suitable loss function for a given dataset and cache the determined loss function for future use. The class analyzes the unique values, class balance, sparsity, and correlation of the target variable `y_true` and the predicted variable `y_pred` to determine whether the problem is a classification or regression problem and select an appropriate loss function.
+
+### Method: `determine_loss_function`
+
+```python
+def determine_loss_function(self, y_pred, y_true):
+    ...
+```
+
+This method determines the most suitable loss function based on the characteristics of `y_pred` and `y_true`.
+
+#### Parameters
+
+- `y_pred` (Tensor): The predicted values.
+- `y_true` (Tensor): The ground truth values.
+
+### Method: `__call__`
+
+```python
+def __call__(self, y_pred, y_true):
+    ...
+```
+
+This method computes the loss using the determined loss function.
+
+#### Parameters
+
+- `y_pred` (Tensor): The predicted values.
+- `y_true` (Tensor): The ground truth values.
+
+#### Returns
+
+- `Tensor`: The computed loss.
+
+### Usage Examples
+
+#### Example 1: Basic Usage
+
+```python
+from zeta import Nebula
+import torch
+
+# Initialize Nebula
+nebula = Nebula()
+
+# Generate some example data
+y_pred = torch.randn(10, 5)
+y_true = torch.randint(0, 5, (10,))
+
+# Compute the loss
+loss = nebula(y_pred, y_true)
+
+print(loss)
+```
+
+#### Example 2: Providing Domain Knowledge
+
+```python
+from zeta import Nebula
+import torch
+
+# Initialize Nebula with domain knowledge
+nebula = Nebula(domain_knowledge="classification")
+
+# Generate some example data
+y_pred = torch.randn(10, 5)
+y_true = torch.randint(0, 5, (10,))
+
+# Compute the loss
+loss = nebula(y_pred, y_true)
+
+print(loss)
+```
+
+#### Example 3: Providing User Input
+
+```python
+from zeta import Nebula
+import torch
+
+# Initialize Nebula with user input
+nebula = Nebula(user_input="regression")
+
+# Generate some example data
+y_pred = torch.randn(10, 1)
+y_true = torch.randn(10, 1)
+
+# Compute the loss
+loss = nebula(y_pred, y_true)
+
+print(loss)
+```
+
+## Mathematical Formula
+
+The `Nebula` class does not have a specific mathematical formula as it dynamically determines the most suitable loss function based on the characteristics of the data. However, the determined loss function will have its own mathematical formula, which can be found in the PyTorch documentation or the `zeta` library documentation.
+
+## Additional Information and Tips
+
+- The `Nebula` class caches the determined loss function, unique values, and class balance for a given dataset to avoid recomputing them in the future.
+- If both `domain_knowledge` and `user_input` are provided, `domain_knowledge` will take precedence over `user_input`.
+- The `Nebula` class uses the `logging` module to log information during the determination of the loss function. You can customize the logging settings by modifying the `logger` attribute.
+
diff --git a/docs/zeta/training/optimizers/decoupled_lion.md b/docs/zeta/training/optimizers/decoupled_lion.md
new file mode 100644
index 0000000..fc3329e
--- /dev/null
+++ b/docs/zeta/training/optimizers/decoupled_lion.md
@@ -0,0 +1,158 @@
+# DecoupledLionW Optimizer
+
+## Overview and Introduction
+
+`DecoupledLionW` is a PyTorch optimizer designed to improve training performance and convergence for deep learning models. It is an extension of the Lion optimizer, which incorporates decoupled weight decay and a momentum-based update rule. 
+
+The optimizer utilizes the Adam-like update rule, where the weight decay is applied separately from the gradient update. This is crucial as it helps prevent overfitting, improves generalization, and aids faster convergence and smoother optimization.
+
+### Key Concepts:
+
+- **Weight Decay:** Reduces the magnitude of the model's weights, preventing overfitting and improving generalization.
+- **Momentum Update:** An interpolation between the current gradient and the previous momentum state, allowing for faster convergence and smoother optimization.
+- **Momentum Decay:** Gradually reduces the momentum term over time, preventing it from becoming too large and destabilizing the optimization process.
+
+## Class Definition
+
+```python
+class DecoupledLionW(Optimizer):
+    def __init__(
+            self,
+            params,
+            lr: float = 1e-4,
+            betas: Tuple[float, float] = (0.9, 0.99),
+            weight_decay: float = 0.0,
+    ):
+```
+
+### Parameters
+
+- `params` (iterable): Iterable of parameters to optimize or dictionaries defining parameter groups.
+- `lr` (float, optional): Learning rate. Default: 1e-4.
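+- `betas` (Tuple[float, float], optional): Coefficients `(beta1, beta2)`. `beta1` weights the momentum against the current gradient when forming the update direction; `beta2` is the decay rate of the momentum's exponential moving average. Default: (0.9, 0.99).
+- `weight_decay` (float, optional): Decoupled weight decay coefficient (applied directly to the parameters rather than added to the loss as an L2 penalty). Default: 0.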
+
+### Attributes
+
+- `metric_functions`: A dictionary of lambda functions to compute various metrics like L2 norm of moments, parameters, updates, and gradients, as well as cosine similarity between updates and gradients.
+
+## Functionality and Usage
+
+### `lionw` Method
+
+This static method is responsible for applying the weight decay, momentum update, and momentum decay.
+
+```python
+@staticmethod
+def lionw(p, grad, exp_avg, lr, initial_lr, wd, beta1, beta2) -> None:
+```
+
+#### Parameters
+
+- `p` (Tensor): Parameter tensor.
+- `grad` (Tensor): Gradient tensor.
+- `exp_avg` (Tensor): Exponential moving average of gradient values.
+- `lr` (float): Learning rate.
+- `initial_lr` (float): Initial learning rate.
+- `wd` (float): Weight decay.
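+- `beta1` (float): Interpolation coefficient between the momentum (`exp_avg`) and the current gradient, used to form the signed update direction.
+- `beta2` (float): Decay rate used when updating the exponential moving average of the gradient (`exp_avg`).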
+
+### `step` Method
+
+Performs a single optimization step.
+
+```python
+@torch.no_grad()
+def step(self, closure: Optional[Callable] = None):
+```
+
+#### Parameters
+
+- `closure` (callable, optional): A closure that reevaluates the model and returns the loss.
+
+#### Returns
+
+- `loss` (float, optional): The loss value if `closure` is provided. None otherwise.
+
+### `pre_reduce_metrics` Method
+
+This method preprocesses the metrics before reduction across nodes.
+
+```python
+def pre_reduce_metrics(self, optimizer_metrics):
+```
+
+#### Parameters
+
+- `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
+
+#### Returns
+
+- `optimizer_metrics` (dict): The pre-processed optimizer metrics.
+
+### `report_per_parameter_metrics` Method
+
+This method reports the per-parameter metrics.
+
+```python
+def report_per_parameter_metrics(self, param: torch.Tensor, name: str, optimizer_metrics: dict):
+```
+
+#### Parameters
+
+- `param` (Tensor): Parameter tensor.
+- `name` (str): Name of the parameter.
+- `optimizer_metrics` (dict): A dictionary containing the optimizer metrics.
+
+#### Returns
+
+- `optimizer_metrics` (dict): The optimizer metrics with the reported per-parameter metrics.
+
+## Usage Examples
+
+```python
+from zeta import DecoupledLionW
+import torch
+
+# Define model parameters
+params = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
+
+# Define optimizer (parameters must be passed as an iterable of tensors)
+optimizer = DecoupledLionW([params], lr=0.1, betas=(0.9, 0.999), weight_decay=0.01)
+
+# Define loss function
+loss_fn = torch.nn.MSELoss()
+
+# Forward pass (toy "model" that simply scales the parameters)
+output = params * 2.0
+target = torch.tensor([0.0, 1.0, 2.0])
+loss = loss_fn(output, target)
+
+# Backward pass
+loss.backward()
+
+# Optimization step
+optimizer.step()
+```
+
+## Mathematical Formula
+
+The update rule of the optimizer can be represented by the following formula:
+
+\[ p = p - \alpha \cdot \text{sign}(\beta_1 \cdot m + (1-\beta_1) \cdot g) - \eta \cdot wd \cdot p \]
+
+Where:
+
+- \( p \) is the parameter.
+- \( \alpha \) is the learning rate.
+- \( \beta_1 \) is the interpolation coefficient between the momentum and the current gradient.
+- \( m \) is the momentum (exponential moving average of gradient values).
+- \( g \) is the gradient.
+- \( \eta \) is the decay factor.
+- \( wd \) is the weight decay.
+
+## Additional Information and Tips
+
+- A high value of `weight_decay` can lead to a large reduction in the model's weights on every step. Be sure to choose a value appropriate for your specific use case.
+- The optimizer supports both single-node and multi-node distributed training, enabling efficient training on parallel computing environments.
diff --git a/docs/zeta/training/optimizers/sophia.md b/docs/zeta/training/optimizers/sophia.md
new file mode 100644
index 0000000..298f3d8
--- /dev/null
+++ b/docs/zeta/training/optimizers/sophia.md
@@ -0,0 +1,108 @@
+# SophiaG Optimizer for Zeta Library
+
+## Overview
+
+The SophiaG optimizer is designed to adaptively change learning rates during training, offering a combination of momentum-based acceleration and second-order Hessian-based adaptive learning rates. This optimizer is particularly useful for training deep neural networks and optimizing complex, non-convex loss functions. Key features include:
+
+1. **Momentum**: Utilizes exponential moving averages of gradients.
+2. **Adaptive Learning Rate**: Adjusts the learning rate based on the second-order Hessian information.
+3. **Regularization**: Applies weight decay to avoid overfitting.
+4. **Optional Settings**: Supports maximizing (rather than minimizing) the objective via `maximize`, and exposes the `capturable` and `dynamic` flags.
+
+## Class Definition
+
+```python
+class SophiaG(Optimizer):
+    def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho=0.04,
+                 weight_decay=1e-1, *, maximize: bool = False,
+                 capturable: bool = False, dynamic: bool = False):
+```
+
+### Parameters:
+
+- `params` (iterable): Iterable of parameters to optimize.
+- `lr` (float, default=1e-4): Learning rate.
+- `betas` (Tuple[float, float], default=(0.965, 0.99)): Coefficients used for computing running averages of gradient and Hessian.
+- `rho` (float, default=0.04): Damping factor for Hessian-based updates.
+- `weight_decay` (float, default=1e-1): Weight decay factor.
+- `maximize` (bool, default=False): Whether to maximize the loss function.
+- `capturable` (bool, default=False): Enable/Disable special capturing features.
+- `dynamic` (bool, default=False): Enable/Disable dynamic adjustments of the optimizer.
+
+## Usage and Functionality
+
+### 1. Initialization
+
+Upon initialization, the optimizer performs validation on its parameters and sets them as the default parameters for parameter groups.
+
+```python
+from zeta import SophiaG
+
+optimizer = SophiaG(model.parameters(), lr=0.01, betas=(0.9, 0.999), weight_decay=1e-4)
+```
+
+### 2. Step Forward
+
+The `.step()` method updates the model parameters. The function is decorated with `@torch.no_grad()` to avoid saving any more computation graphs for gradient computation.
+
+```python
+loss = criterion(output, target)
+loss.backward()
+optimizer.step()
+```
+
+### 3. Update Hessian and Exponential Average
+
+The optimizer has internal methods to update the Hessian and Exponential Moving Average (EMA) of the gradients, controlled by `betas`.
+
+### 4. SophiaG Function
+
+The core SophiaG function updates the parameters based on the gradient (`grad`), moving average (`exp_avg`), and Hessian (`hessian`). It uses the following update formula:
+
+\[ \text{param} = \text{param} - \text{lr} \times \left( \text{beta}_1 \times \text{exp_avg} + \frac{(1-\text{beta}_1) \times \text{grad}}{( \text{beta}_2 \times \text{hessian} + (1-\text{beta}_2) )^{\rho}} \right) \]
+
+## Usage Examples
+
+### 1. Basic Usage:
+
+```python
+from zeta import SophiaG
+import torch
+import torch.nn as nn
+
+model = nn.Linear(10, 1)
+optimizer = SophiaG(model.parameters(), lr=0.01)
+```
+
+### 2. Customizing Betas and Learning Rate:
+
+```python
+from zeta import SophiaG
+import torch
+
+optimizer = SophiaG(model.parameters(), lr=0.001, betas=(0.9, 0.999))
+```
+
+### 3. Using with Weight Decay:
+
+```python
+from zeta import SophiaG
+
+optimizer = SophiaG(model.parameters(), lr=0.01, weight_decay=1e-4)
+```
+
+## Additional Information and Tips
+
+- Make sure that the parameters passed are compatible with the model you are using.
+- To maximize the loss function (useful in adversarial training), set `maximize=True`.
+
+## Common Issues
+
+- If sparse gradients are involved, the SophiaG optimizer is not applicable.
+
+## References and Resources
+
+- [Adaptive Learning Rates](https://arxiv.org/pdf/1609.04747)
+- [Zeta Documentation](https://zeta.apac.ai)
+
+For further questions or issues, visit our [GitHub repository](https://github.com/kyegomez/zeta).
diff --git a/docs/zeta/training/train.md b/docs/zeta/training/train.md
new file mode 100644
index 0000000..d6ac0e7
--- /dev/null
+++ b/docs/zeta/training/train.md
@@ -0,0 +1,139 @@
+# Documentation for `Trainer` Module from Zeta Library
+
+---
+
+## Introduction
+
+The `Trainer` module from the Zeta library provides an easy-to-use, flexible, and scalable approach to training deep learning models. By abstracting away many of the lower-level details of training, including distributed training, gradient accumulation, and model checkpointing, `Trainer` allows developers to focus on the high-level aspects of model development and experimentation.
+
+This module also integrates seamlessly with the HuggingFace `Accelerator` to enable mixed precision training, GPU acceleration, and distributed training across multiple nodes or GPUs.
+
+---
+
+## `Trainer` Class Definition
+
+```python
+def Trainer(
+        gradient_accumulate_every: int = None, 
+        batch_size: int = None, 
+        seq_len: int = None,
+        entity_name: str = None,
+        model = None,
+        use_fsdp: bool = False,
+        use_activation_checkpointing: bool = False,
+        learning_rate = None,
+        seed = None,
+        use_pretokenized: bool = False,
+        resume_from_checkpoint = None,
+        checkpointing_steps = None,
+        output_dir = None,
+        weight_decay = None,
+        use_deepspeed = None
+    ):
+```
+
+### Parameters
+
+- `gradient_accumulate_every` (`int`, optional): Specifies how often to accumulate gradients. Default: `None`.
+- `batch_size` (`int`, optional): Specifies the batch size for training. Default: `None`.
+- `seq_len` (`int`, optional): Sequence length for model inputs. Default: `None`.
+- `entity_name` (`str`, optional): Name of the entity for logging purposes. Default: `None`.
+- `model`: The model to train. Default: `None`.
+- `use_fsdp` (`bool`, optional): Whether or not to use Fully Sharded Data Parallelism (FSDP). Default: `False`.
+- `use_activation_checkpointing` (`bool`, optional): Use activation checkpointing to save memory during training. Default: `False`.
+- `learning_rate`: The learning rate for training. Default: `None`.
+- `seed`: Random seed for reproducibility. Default: `None`.
+- `use_pretokenized` (`bool`, optional): Whether to use pre-tokenized data. Default: `False`.
+- `resume_from_checkpoint`: Path to a checkpoint to resume training from. Default: `None`.
+- `checkpointing_steps`: How often to save model checkpoints. Default: `None`.
+- `output_dir`: Directory to save final trained model and checkpoints. Default: `None`.
+- `weight_decay`: Weight decay value for regularization. Default: `None`.
+- `use_deepspeed`: Whether to use deepspeed for training optimization. Default: `None`.
+
+---
+
+## Functionality and Usage
+
+The primary function of the `Trainer` module is to handle the training process, including data loading, optimization, and model updates. It leverages HuggingFace's `Accelerator` to provide accelerated training on GPUs and distributed environments.
+
+Here are the primary steps:
+
+1. Initialization of the `Accelerator` for GPU training and gradient accumulation.
+2. Model and optimizer initialization.
+3. Loading datasets and setting up data loaders.
+4. Training loop with gradient accumulation and model checkpointing.
+5. Save the final trained model.
+
+### Code Examples
+
+**1. Basic Usage**
+
+```python
+from zeta import Trainer
+
+model = ... # Your model definition here
+Trainer(
+    gradient_accumulate_every=2,
+    batch_size=32,
+    seq_len=128,
+    model=model,
+    learning_rate=0.001,
+    seed=42,
+    output_dir='./models/'
+)
+```
+
+**2. Resuming Training from a Checkpoint**
+
+```python
+from zeta import Trainer
+
+model = ... # Your model definition here
+Trainer(
+    gradient_accumulate_every=2,
+    batch_size=32,
+    seq_len=128,
+    model=model,
+    learning_rate=0.001,
+    seed=42,
+    resume_from_checkpoint='./models/checkpoint.pt',
+    output_dir='./models/'
+)
+```
+
+**3. Using FSDP and Activation Checkpointing**
+
+```python
+from zeta import Trainer
+
+model = ... # Your model definition here
+Trainer(
+    gradient_accumulate_every=2,
+    batch_size=32,
+    seq_len=128,
+    model=model,
+    use_fsdp=True,
+    use_activation_checkpointing=True,
+    learning_rate=0.001,
+    seed=42,
+    output_dir='./models/'
+)
+```
+
+---
+
+## Mathematical Description
+
+Given a dataset \( D \) consisting of data points \( \{ (x_1, y_1), (x_2, y_2), ... (x_N, y_N) \} \), the trainer aims to minimize the loss function \( L \) with respect to model parameters \( \theta \):
+
+\[ \theta^* = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^{N} L(f(x_i; \theta), y_i) \]
+
+
+
+where \( f \) is the model's prediction function.
+
+---
+
+## Conclusions
+
+The `Trainer` module from the Zeta library streamlines the training process by abstracting away many complexities, making it a valuable tool for developers at all experience levels. Whether you are training a simple model or a complex architecture in a distributed environment, the `Trainer` module offers the flexibility and ease of use to get your models trained efficiently.
\ No newline at end of file
diff --git a/example.py b/example.py
new file mode 100644
index 0000000..e69de29
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..08107b0
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,124 @@
+site_name: Package Docs
+plugins:
+  - glightbox
+  - search
+copyright: "© APAC Corp, Inc."
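+# extra_css:                      # duplicate key: `extra_css` is defined again further down,
+#   - docs/assets/css/extra.css   # and only one definition of a top-level key can take effect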
+extra:
+  # analytics:
+  #   provider: google
+  #   property: G-QM8EDPSCB6
+  social:
+    - icon: fontawesome/solid/house
+      link: assets/img/ZetaLogoIcon.png
+    - icon: fontawesome/brands/discord
+      link: https://discord.gg/qUtxnK2NMf
+    - icon: fontawesome/brands/github
+      link: https://github.com/kyegomez/Zeta/
+    - icon: fontawesome/brands/python
+      link: https://pypi.org/project/Zeta/
+theme:
+    name: material
+    custom_dir: docs/overrides
+    logo: assets/img/ZetaLogoIcon.png
+    palette:
+      # Palette toggle for light mode
+    - scheme: default
+      primary: 'custom'
+      toggle:
+        icon: material/brightness-7 
+        name: Switch to dark mode
+    # Palette toggle for dark mode
+    - scheme: slate
+      primary: 'custom'
+      accent: light blue
+      toggle:
+        icon: material/brightness-4
+        name: Switch to light mode
+    features:
+        - content.code.copy
+        - content.code.annotate
+        - navigation.tabs
+        - navigation.sections
+        - navigation.expand
+        - navigation.top
+        - announce.dismiss
+    font:
+      text: Roboto
+      code: Roboto Mono
+
+extra_css:
+  - stylesheets/extra.css
+
+markdown_extensions:
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - admonition
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
+  - pymdownx.details
+  - pymdownx.tabbed
+  - tables
+  - def_list
+  - footnotes
+
+
+nav:
+- Home:
+    - Overview: "index.md"
+    - Contributing: "contributing.md"
+    - FAQ: "faq.md"
+    - Purpose: "purpose.md"
+    - Roadmap: "roadmap.md"
+    - Design: "design.md"
+    - Flywheel: "flywheel.md"
+    - Bounties: "bounties.md"
+    - Metric: "metric.md"
+    - Distribution: "distribution.md"
+    - Research: "research.md"
+    - Demos: "demos.md"
+    - Architecture: "architecture.md"
+    - Checklist: "checklist.md"
+    - Hiring: "hiring.md"
+- Zeta:
+    - Overview: "zeta/index.md"
+    - zeta.nn:
+      - zeta.nn.biases: 
+        - Xpos: "zeta/nn/biases/xpos.md"
+        - RelativePositionBias: "zeta/nn/biases/relative_bias.md"
+        - AlibiPositionalBias: "zeta/nn/biases/alibi.md"
+      - zeta.nn.embeddings:
+        - MultiWay: "zeta/nn/embeddings/multiway.md"
+        - RotaryEmbeddings: "zeta/nn/embeddings/rope.md"
+        - TruncatedRotaryEmbedding: "zeta/nn/embeddings/truncated_rope.md"
+      - zeta.nn.modules:
+        - Lora: "zeta/nn/modules/lora.md"
+        - TokenLearner: "zeta/nn/modules/token_learner.md"
+      - zeta.nn.attention:
+        - FlashAttention: "zeta/nn/attention/flash_attention.md"
+        - MultiQueryAttention: "zeta/nn/attention/multiquery.md"
+        - MultiheadAttention: "zeta/nn/attention/multihead.md"
+        - FlashAttentionTwo: "zeta/nn/attention/flash2.md"
+        - BaseAttention: "zeta/nn/attention/base.md"
+      - zeta.nn.architecture:
+        - Decoder: "zeta/nn/architecture/decoder.md"
+        - Transformer: "zeta/nn/architecture/transformer.md"
+    - zeta.training:
+      - train: "zeta/training/train.md"
+      - zeta.training.loss:
+        - Nebula: "zeta/training/nebula.md"
+      - zeta.training.optimizers:
+        - DecoupledLionW: "zeta/training/optimizers/decoupled_lion.md"
+        - SophiaG: "zeta/training/optimizers/sophia.md"
+    - zeta.tokenizers:
+        - MultiModalTokenizer: "zeta/tokenizers/multi_modal_tokenizer.md"
+        - LanguageTokenizerGPTX: "zeta/tokenizers/language_tokenizer.md"
+        - SentencePieceTokenizer: "zeta/tokenizers/sentencepiece.md"
+- Examples:
+    - Overview: "examples/index.md"
+    - FlashAttention: "examples/nn/attentions/flash.md"
+    
\ No newline at end of file
diff --git a/package/__init__.py b/package/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/package/main.py b/package/main.py
new file mode 100644
index 0000000..e69de29
diff --git a/package/subfolder/__init__.py b/package/subfolder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/package/subfolder/main.py b/package/subfolder/main.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5d4ac8e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,57 @@
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "paper"
+version = "0.0.1"
+description = "Paper - Pytorch"
+license = "MIT"
+authors = ["Kye Gomez "]
+homepage = "https://github.com/kyegomez/paper"
+documentation = "https://github.com/kyegomez/paper"  # Add this if you have documentation.
+readme = "README.md"  # Assuming you have a README.md
+repository = "https://github.com/kyegomez/paper"
+keywords = ["artificial intelligence", "deep learning", "optimizers", "Prompt Engineering"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3.9"
+]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+swarms = "*"
+zetascale = "*"
+
+[tool.poetry.dev-dependencies]
+# Add development dependencies here
+
+
+[tool.poetry.group.lint.dependencies]
+ruff = "^0.1.6"
+types-toml = "^0.10.8.1"
+types-redis = "^4.3.21.6"
+types-pytz = "^2023.3.0.0"
+black = "^23.1.0"
+types-chardet = "^5.0.4.6"
+mypy-protobuf = "^3.0.0"
+
+
+[tool.autopep8]
+max_line_length = 80
+ignore = "E501,W6"  # or ["E501", "W6"]
+in-place = true
+recursive = true
+aggressive = 3
+
+
+[tool.ruff]
+line-length = 70
+
+[tool.black]
+line-length = 70
+target-version = ['py38']
+preview = true
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..236a195
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+torch
+zetascale
+swarms
diff --git a/scripts/code_quality.sh b/scripts/code_quality.sh
new file mode 100755
index 0000000..dcebf05
--- /dev/null
+++ b/scripts/code_quality.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Format and lint every Python file under 'package/'. Run from the directory
+# that contains the 'package' folder; all paths below are relative to it.
+
+# autopep8: rewrite files in place, recursively, with two levels of
+# --aggressive. Do not add --list-fixes: it prints the fix codes and exits.
+autopep8 --in-place --aggressive --aggressive --recursive --experimental package/
+
+# black (23.x): --preview replaces the deprecated
+# --experimental-string-processing flag and includes its string handling.
+black --preview package/
+
+# ruff: lint 'package/' and apply fixes, including the ones ruff classifies
+# as unsafe (--unsafe-fixes only has an effect together with --fix).
+ruff check package/ --fix --unsafe-fixes
+
+# YAPF
+yapf --recursive --in-place --verbose --style=google --parallel package
diff --git a/scripts/merge_all_prs.sh b/scripts/merge_all_prs.sh
new file mode 100755
index 0000000..1135823
--- /dev/null
+++ b/scripts/merge_all_prs.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Run `gh pr merge --auto --merge` (merge-commit strategy) on every open PR.
+
+# Check if we are inside a Git repository
+if ! git rev-parse --git-dir > /dev/null 2>&1; then
+    echo "Error: Must be run inside a Git repository." >&2
+    exit 1
+fi
+
+# Fetch all open pull requests; a gh failure must not look like "no open PRs"
+echo "Fetching open PRs..."
+prs=$(gh pr list --state open --json number --jq '.[].number') ||
+    { echo "Error: Could not list open PRs." >&2; exit 1; }
+
+# Check if there are PRs to merge
+if [ -z "$prs" ]; then
+    echo "No open PRs to merge."
+    exit 0
+fi
+
+echo "Found PRs: $prs"
+
+# Merge each PR number; 2>&1 puts gh's stderr text into the failure message
+for pr in $prs; do
+    echo "Attempting to merge PR #$pr"
+    if merge_output=$(gh pr merge "$pr" --auto --merge 2>&1); then
+        echo "Successfully merged PR #$pr"
+    else
+        echo "Failed to merge PR #$pr. Error: $merge_output" >&2
+    fi
+done
+
+echo "Processing complete."
diff --git a/scripts/test_name.sh b/scripts/test_name.sh
new file mode 100755
index 0000000..cdc6a01
--- /dev/null
+++ b/scripts/test_name.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Prefix every Python file under ./tests with "test_" unless it already has it.
+# __init__.py / conftest.py keep their names: pytest and imports rely on them.
+find ./tests -type f -name "*.py" ! -name "test_*" ! -name "__init__.py" ! -name "conftest.py" -print0 |
+  while IFS= read -r -d '' file; do
+    dir=$(dirname "$file")
+    mv -- "$file" "$dir/test_$(basename "$file")"
+  done
\ No newline at end of file
diff --git a/scripts/tests.sh b/scripts/tests.sh
new file mode 100644
index 0000000..13f4111
--- /dev/null
+++ b/scripts/tests.sh
@@ -0,0 +1 @@
+find ./tests -name '*.py' -exec pytest {} +  # '+' passes the files to pytest in batches; unlike '\;', find then exits non-zero when pytest fails
\ No newline at end of file