@@ -38,6 +38,18 @@ cd torchscale
pip install -e .
```

For faster training, install [Flash Attention](https://github.com/Dao-AILab/flash-attention) for Turing, Ampere, Ada, or Hopper GPUs:
```
pip install flash-attn
```
or install [xFormers](https://github.com/facebookresearch/xformers) for Volta, Turing, Ampere, Ada, or Hopper GPUs:
```
# CUDA 11.8 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
# CUDA 12.1 version
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
```

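To sanity-check whichever backend you installed, a quick option is to query it from the command line:
```
# Flash Attention: prints the installed version if the extension imports cleanly
python -c "import flash_attn; print(flash_attn.__version__)"
# xFormers: prints build info and which fused kernels are available
python -m xformers.info
```
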
## Getting Started

It takes only a few lines of code to create a model with the fundamental research features above enabled. Here is how to quickly obtain a BERT-like encoder:
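For reference, here is a minimal sketch of that pattern, assuming the `EncoderConfig` and `Encoder` classes in `torchscale.architecture.config` and `torchscale.architecture.encoder` (the same construction style the RetNet and LongNet examples below use):
```python
>>> from torchscale.architecture.config import EncoderConfig
>>> from torchscale.architecture.encoder import Encoder

# A BERT-like bidirectional encoder; vocab_size matches the other examples in this README
>>> config = EncoderConfig(vocab_size=64000)
>>> model = Encoder(config)

>>> print(model)
```
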
@@ -86,6 +98,21 @@ It takes only several lines of code to create a RetNet model:
>>> print(retnet)
```

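Only the tail of the RetNet example appears above. The full construction follows the same pattern; a minimal sketch, assuming `RetNetConfig` in `torchscale.architecture.config` and `RetNetDecoder` in `torchscale.architecture.retnet`:
```python
>>> from torchscale.architecture.config import RetNetConfig
>>> from torchscale.architecture.retnet import RetNetDecoder

# Retentive-network decoder with the same vocabulary size as the other examples
>>> config = RetNetConfig(vocab_size=64000)
>>> retnet = RetNetDecoder(config)

>>> print(retnet)
```
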
For LongNet models ([Flash Attention](https://github.com/Dao-AILab/flash-attention) is required):
```python
>>> import torch
>>> from torchscale.architecture.config import EncoderConfig, DecoderConfig
>>> from torchscale.model.longnet import LongNetEncoder, LongNetDecoder

# Create a LongNet encoder with the dilated pattern segment_length=[2048,4096] and dilated_ratio=[1,2]
>>> config = EncoderConfig(vocab_size=64000, segment_length='[2048,4096]', dilated_ratio='[1,2]', flash_attention=True)
>>> longnet = LongNetEncoder(config)

# Create a LongNet decoder with the same dilated pattern
>>> config = DecoderConfig(vocab_size=64000, segment_length='[2048,4096]', dilated_ratio='[1,2]', flash_attention=True)
>>> longnet = LongNetDecoder(config)
```

## Key Features

- [DeepNorm to improve the training stability of Post-LayerNorm Transformers](https://arxiv.org/abs/2203.00555)
@@ -231,6 +258,24 @@ If you find this repository useful, please consider citing our work:
}
```

```
@article{longnet,
  author  = {Jiayu Ding and Shuming Ma and Li Dong and Xingxing Zhang and Shaohan Huang and Wenhui Wang and Nanning Zheng and Furu Wei},
  title   = {{LongNet}: Scaling Transformers to 1,000,000,000 Tokens},
  journal = {ArXiv},
  volume  = {abs/2307.02486},
  year    = {2023}
}
```

```
@article{longvit,
  title   = {When an Image is Worth 1,024 x 1,024 Words: A Case Study in Computational Pathology},
  author  = {Wenhui Wang and Shuming Ma and Hanwen Xu and Naoto Usuyama and Jiayu Ding and Hoifung Poon and Furu Wei},
  journal = {ArXiv},
  volume  = {abs/2312.03558},
  year    = {2023}
}
```

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a