From 4a65b5752c2023271eb11822802a167022b01b89 Mon Sep 17 00:00:00 2001 From: "promptless[bot]" <179508745+promptless[bot]@users.noreply.github.com> Date: Wed, 6 Nov 2024 21:59:31 +0000 Subject: [PATCH] Docs update (0252da9) --- docs/source/en/chat_templating.md | 472 ++++++++++----------- docs/source/en/conversations.md | 48 ++- docs/source/en/model_doc/idefics2.md | 59 ++- docs/source/en/model_doc/llava_next.md | 126 +++--- docs/source/en/model_doc/mllama.md | 44 +- docs/source/en/model_summary.md | 3 + docs/source/en/pipeline_tutorial.md | 69 ++- docs/source/en/tasks/image_text_to_text.md | 76 ++-- docs/source/en/tasks/video_text_to_text.md | 28 +- docs/source/es/chat_templating.md | 101 ++--- docs/source/fr/quicktour.md | 124 +++--- docs/source/ja/pipeline_tutorial.md | 36 +- docs/source/ko/pipeline_tutorial.md | 76 +++- 13 files changed, 735 insertions(+), 527 deletions(-) diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 1bdf05a26c8d..db6054abf812 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -62,7 +62,7 @@ with totally different chat formats. Without chat templates, you would have to w model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting for you, allowing you to write universal code that works for any model. - +With the introduction of the `ImageTextToTextPipeline`, chat templates can now also handle multi-modal inputs, where messages can include both text and images. This allows for more complex interactions, such as visual question answering or image-based text generation, using the same chat templating system. ## How do I use chat templates? As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` @@ -80,28 +80,28 @@ tokenizer = AutoTokenizer.from_pretrained(checkpoint) model = AutoModelForCausalLM.from_pretrained(checkpoint) # You may want to use bfloat16 and/or move to GPU here messages = [ - { - "role": "system", - "content": "You are a friendly chatbot who always responds in the style of a pirate", - }, - {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, - ] +{ +"role": "system", +"content": "You are a friendly chatbot who always responds in the style of a pirate", +}, +{"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, +] tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") print(tokenizer.decode(tokenized_chat[0])) ``` This will yield a string in the input format that Zephyr expects. ```text <|system|> -You are a friendly chatbot who always responds in the style of a pirate +You are a friendly chatbot who always responds in the style of a pirate <|user|> -How many helicopters can a human eat in one sitting? +How many helicopters can a human eat in one sitting? <|assistant|> ``` Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question: ```python -outputs = model.generate(tokenized_chat, max_new_tokens=128) +outputs = model.generate(tokenized_chat, max_new_tokens=128) print(tokenizer.decode(outputs[0])) ``` @@ -109,9 +109,9 @@ This will yield: ```text <|system|> -You are a friendly chatbot who always responds in the style of a pirate +You are a friendly chatbot who always responds in the style of a pirate <|user|> -How many helicopters can a human eat in one sitting? 
+How many helicopters can a human eat in one sitting? <|assistant|> Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. ``` @@ -130,11 +130,11 @@ from transformers import pipeline pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") messages = [ - { - "role": "system", - "content": "You are a friendly chatbot who always responds in the style of a pirate", - }, - {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, +{ +"role": "system", +"content": "You are a friendly chatbot who always responds in the style of a pirate", +}, +{"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, ] print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # Print the assistant's response ``` @@ -153,9 +153,9 @@ the template to add tokens that indicate the start of a bot response. For exampl ```python messages = [ - {"role": "user", "content": "Hi there!"}, - {"role": "assistant", "content": "Nice to meet you!"}, - {"role": "user", "content": "Can I ask a question?"} +{"role": "user", "content": "Hi there!"}, +{"role": "assistant", "content": "Nice to meet you!"}, +{"role": "user", "content": "Can I ask a question?"} ] ``` @@ -198,48 +198,32 @@ effect that `add_generation_prompt` has will depend on the template being used. ## What does "continue_final_message" do? -When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose -to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done -by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply -extend the final message when it begins to generate text. This is useful for "prefilling" the model's response. +When passing a list of messages to `apply_chat_template`, `TextGenerationPipeline`, or `ImageTextToTextPipeline`, you can choose to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply extend the final message when it begins to generate text. This is useful for "prefilling" the model's response. Here's an example: ```python chat = [ - {"role": "user", "content": "Can you format the answer in JSON?"}, - {"role": "assistant", "content": '{"name": "'}, +{"role": "user", "content": "Can you format the answer in JSON?"}, +{"role": "assistant", "content": '{"name": "'}, ] formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True) model.generate(**formatted_chat) ``` -The model will generate text that continues the JSON string, rather than starting a new message. This approach -can be very useful for improving the accuracy of the model's instruction-following when you know how you want -it to start its replies. +The model will generate text that continues the JSON string, rather than starting a new message. This approach can be very useful for improving the accuracy of the model's instruction-following when you know how you want it to start its replies. 
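+
+If you prefer to work at the pipeline level, the same prefill pattern can be expressed there too. The snippet below is a minimal sketch that reuses the Zephyr checkpoint from the earlier examples; as described further down, the pipeline will also infer `continue_final_message=True` on its own when the final message has the "assistant" role, so passing it explicitly is optional.
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta")
+chat = [
+    {"role": "user", "content": "Can you format the answer in JSON?"},
+    # The trailing assistant message acts as a prefill that the model will extend
+    {"role": "assistant", "content": '{"name": "'},
+]
+# Passing continue_final_message here is a sketch of the explicit override described below;
+# the pipeline would normally infer it from the trailing assistant message.
+out = pipe(chat, max_new_tokens=64, continue_final_message=True)
+print(out[0]["generated_text"][-1]["content"])
+```
+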
-Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any -end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll -get an error if you try! +Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll get an error if you try! -The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new -message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is -a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple -consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` -argument when calling the pipeline. +The default behaviour of `TextGenerationPipeline` and `ImageTextToTextPipeline` is to set `add_generation_prompt=True` so that it starts a new message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` argument when calling the pipeline. - ## Can I use chat templates in training? -Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. -We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you -can simply continue like any other language model training task. When training, you should usually set -`add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during -training. Let's see an example: +Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you can simply continue like any other language model training task. When training, you should usually set `add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during training. Let's see an example: ```python from transformers import AutoTokenizer @@ -248,12 +232,12 @@ from datasets import Dataset tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") chat1 = [ - {"role": "user", "content": "Which is bigger, the moon or the sun?"}, - {"role": "assistant", "content": "The sun."} +{"role": "user", "content": "Which is bigger, the moon or the sun?"}, +{"role": "assistant", "content": "The sun."} ] chat2 = [ - {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, - {"role": "assistant", "content": "A bacterium."} +{"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, +{"role": "assistant", "content": "A bacterium."} ] dataset = Dataset.from_dict({"chat": [chat1, chat2]}) @@ -272,15 +256,17 @@ From here, just continue training like you would with a standard language modell -By default, some tokenizers add special tokens like `` and `` to text they tokenize. 
Chat templates should -already include all the special tokens they need, and so additional special tokens will often be incorrect or -duplicated, which will hurt model performance. +By default, some tokenizers add special tokens like `` and `` to text they tokenize. Chat templates should already include all the special tokens they need, and so additional special tokens will often be incorrect or duplicated, which will hurt model performance. -Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument -`add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this! +Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument `add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this! + + +If you are using the new `ImageTextToTextPipeline`, ensure that your chat templates are compatible with both text and image inputs. This will help maintain consistency in the tokens seen by the model during training and inference. + + ## Advanced: Extra inputs to chat templates The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword @@ -303,24 +289,24 @@ to a tool-use model, you can simply pass a list of functions to the `tools` argu import datetime def current_time(): - """Get the current local time as a string.""" - return str(datetime.now()) +"""Get the current local time as a string.""" +return str(datetime.now()) def multiply(a: float, b: float): - """ - A function that multiplies two numbers - - Args: - a: The first number to multiply - b: The second number to multiply - """ - return a * b +""" +A function that multiplies two numbers + +Args: +a: The first number to multiply +b: The second number to multiply +""" +return a * b tools = [current_time, multiply] model_input = tokenizer.apply_chat_template( - messages, - tools=tools +messages, +tools=tools ) ``` @@ -370,27 +356,27 @@ Next, let's define a list of tools: ```python def get_current_temperature(location: str, unit: str) -> float: - """ - Get the current temperature at a location. - - Args: - location: The location to get the temperature for, in the format "City, Country" - unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) - Returns: - The current temperature at the specified location in the specified units, as a float. - """ - return 22. # A real function should probably actually get the temperature! +""" +Get the current temperature at a location. + +Args: +location: The location to get the temperature for, in the format "City, Country" +unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) +Returns: +The current temperature at the specified location in the specified units, as a float. +""" +return 22. # A real function should probably actually get the temperature! def get_current_wind_speed(location: str) -> float: - """ - Get the current wind speed in km/h at a given location. - - Args: - location: The location to get the temperature for, in the format "City, Country" - Returns: - The current wind speed at the given location in km/h, as a float. - """ - return 6. # A real function should probably actually get the wind speed! +""" +Get the current wind speed in km/h at a given location. 
+ +Args: +location: The location to get the temperature for, in the format "City, Country" +Returns: +The current wind speed at the given location in km/h, as a float. +""" +return 6. # A real function should probably actually get the wind speed! tools = [get_current_temperature, get_current_wind_speed] ``` @@ -399,8 +385,8 @@ Now, let's set up a conversation for our bot: ```python messages = [ - {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, - {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} +{"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, +{"role": "user", "content": "Hey, what's the temperature in Paris right now?"} ] ``` @@ -446,6 +432,21 @@ messages.append({"role": "assistant", "tool_calls": [{"type": "function", "funct If you're familiar with the OpenAI API, you should pay attention to an important difference here - the `tool_call` is a dict, but in the OpenAI API it's a JSON string. Passing a string may cause errors or strange model behaviour! + + + +Next, let's append the model's tool call to the conversation. + +```python +tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} +messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) +``` + + + +If you're familiar with the OpenAI API, you should pay attention to an important difference here - the `tool_call` is +a dict, but in the OpenAI API it's a JSON string. Passing a string may cause errors or strange model behaviour! + Now that we've added the tool call to the conversation, we can call the function and append the result to the @@ -495,7 +496,6 @@ The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|> Although this was a simple demo with dummy tools and a single call, the same technique works with multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational agents with real-time information, computational tools like calculators, or access to large databases. - ### Understanding tool schemas Each function you pass to the `tools` argument of `apply_chat_template` is converted into a @@ -514,14 +514,14 @@ you can handle the conversion manually. 
Here is an example of a manual schema co from transformers.utils import get_json_schema def multiply(a: float, b: float): - """ - A function that multiplies two numbers - - Args: - a: The first number to multiply - b: The second number to multiply - """ - return a * b +""" +A function that multiplies two numbers + +Args: +a: The first number to multiply +b: The second number to multiply +""" +return a * b schema = get_json_schema(multiply) print(schema) @@ -531,25 +531,25 @@ This will yield: ```json { - "type": "function", - "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", - "parameters": { - "type": "object", - "properties": { - "a": { - "type": "number", - "description": "The first number to multiply" - }, - "b": { - "type": "number", - "description": "The second number to multiply" - } - }, - "required": ["a", "b"] - } - } +"type": "function", +"function": { +"name": "multiply", +"description": "A function that multiplies two numbers", +"parameters": { +"type": "object", +"properties": { +"a": { +"type": "number", +"description": "The first number to multiply" +}, +"b": { +"type": "number", +"description": "The second number to multiply" +} +}, +"required": ["a", "b"] +} +} } ``` @@ -565,42 +565,42 @@ Here is an example of defining schemas by hand, and passing them directly to `ap ```python # A simple function that takes no arguments current_time = { - "type": "function", - "function": { - "name": "current_time", - "description": "Get the current local time as a string.", - "parameters": { - 'type': 'object', - 'properties': {} - } - } +"type": "function", +"function": { +"name": "current_time", +"description": "Get the current local time as a string.", +"parameters": { +'type': 'object', +'properties': {} +} +} } # A more complete function that takes two numerical arguments multiply = { - 'type': 'function', - 'function': { - 'name': 'multiply', - 'description': 'A function that multiplies two numbers', - 'parameters': { - 'type': 'object', - 'properties': { - 'a': { - 'type': 'number', - 'description': 'The first number to multiply' - }, - 'b': { - 'type': 'number', 'description': 'The second number to multiply' - } - }, - 'required': ['a', 'b'] - } - } +'type': 'function', +'function': { +'name': 'multiply', +'description': 'A function that multiplies two numbers', +'parameters': { +'type': 'object', +'properties': { +'a': { +'type': 'number', +'description': 'The first number to multiply' +}, +'b': { +'type': 'number', 'description': 'The second number to multiply' +} +}, +'required': ['a', 'b'] +} +} } model_input = tokenizer.apply_chat_template( - messages, - tools = [current_time, multiply] +messages, +tools = [current_time, multiply] ) ``` @@ -626,37 +626,37 @@ device = model.device # Get the device the model is loaded on # Define conversation input conversation = [ - {"role": "user", "content": "What has Man always dreamed of?"} +{"role": "user", "content": "What has Man always dreamed of?"} ] # Define documents for retrieval-based generation documents = [ - { - "title": "The Moon: Our Age-Old Foe", - "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." - }, - { - "title": "The Sun: Our Age-Old Friend", - "text": "Although often underappreciated, the sun provides several notable benefits..." - } +{ +"title": "The Moon: Our Age-Old Foe", +"text": "Man has always dreamed of destroying the moon. In this essay, I shall..." 
+}, +{ +"title": "The Sun: Our Age-Old Friend", +"text": "Although often underappreciated, the sun provides several notable benefits..." +} ] # Tokenize conversation and documents using a RAG template, returning PyTorch tensors. input_ids = tokenizer.apply_chat_template( - conversation=conversation, - documents=documents, - chat_template="rag", - tokenize=True, - add_generation_prompt=True, - return_tensors="pt").to(device) - -# Generate a response +conversation=conversation, +documents=documents, +chat_template="rag", +tokenize=True, +add_generation_prompt=True, +return_tensors="pt").to(device) + +# Generate a response gen_tokens = model.generate( - input_ids, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) +input_ids, +max_new_tokens=100, +do_sample=True, +temperature=0.3, +) # Decode and print the generated text along with generation prompt gen_text = tokenizer.decode(gen_tokens[0]) @@ -683,11 +683,11 @@ one is a little simplified from the actual one! ``` {%- for message in messages %} - {{- '<|' + message['role'] + |>\n' }} - {{- message['content'] + eos_token }} +{{- '<|' + message['role'] + |>\n' }} +{{- message['content'] + eos_token }} {%- endfor %} {%- if add_generation_prompt %} - {{- '<|assistant|>\n' }} +{{- '<|assistant|>\n' }} {%- endif %} ``` @@ -697,10 +697,10 @@ syntax resembles Python. In pure Python, this template would look something like ```python for message in messages: - print(f'<|{message["role"]}|>') - print(message['content'] + eos_token) +print(f'<|{message["role"]}|>') +print(message['content'] + eos_token) if add_generation_prompt: - print('<|assistant|>') +print('<|assistant|>') ``` Effectively, the template does three things: @@ -716,13 +716,13 @@ in your actual code!) ``` {%- for message in messages %} - {%- if message['role'] == 'user' %} - {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }} - {%- elif message['role'] == 'system' %} - {{- '<>\\n' + message['content'] + '\\n<>\\n\\n' }} - {%- elif message['role'] == 'assistant' %} - {{- ' ' + message['content'] + ' ' + eos_token }} - {%- endif %} +{%- if message['role'] == 'user' %} +{{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }} +{%- elif message['role'] == 'system' %} +{{- '<>\\n' + message['content'] + '\\n<>\\n\\n' }} +{%- elif message['role'] == 'assistant' %} +{{- ' ' + message['content'] + ' ' + eos_token }} +{%- endif %} {%- endfor %} ``` @@ -740,13 +740,13 @@ above and add "[ASST]" and "[/ASST]" to assistant messages: ``` {%- for message in messages %} - {%- if message['role'] == 'user' %} - {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} - {%- elif message['role'] == 'system' %} - {{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} - {%- elif message['role'] == 'assistant' %} - {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} - {%- endif %} +{%- if message['role'] == 'user' %} +{{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} +{%- elif message['role'] == 'system' %} +{{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} +{%- elif message['role'] == 'assistant' %} +{{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} +{%- endif %} {%- endfor %} ``` @@ -807,7 +807,7 @@ It looks like this: ``` {%- for message in messages %} - {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} {%- endfor %} ``` @@ -879,7 +879,7 @@ your templates like this: ``` {%- for 
message in messages %} - {{- message['role'] + message['content'] }} +{{- message['role'] + message['content'] }} {%- endfor %} ``` @@ -887,7 +887,7 @@ rather than like this: ``` {% for message in messages %} - {{ message['role'] + message['content'] }} +{{ message['role'] + message['content'] }} {% endfor %} ``` @@ -954,10 +954,10 @@ Here is an example of a template that formats messages ChatML-style, with genera ```text {{- bos_token }} {%- for message in messages %} - {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} {%- endfor %} {%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} +{{- '<|im_start|>assistant\n' }} {%- endif %} ``` @@ -1013,25 +1013,25 @@ a sample tool JSON schema: ```json { - "type": "function", - "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", - "parameters": { - "type": "object", - "properties": { - "a": { - "type": "number", - "description": "The first number to multiply" - }, - "b": { - "type": "number", - "description": "The second number to multiply" - } - }, - "required": ["a", "b"] - } - } +"type": "function", +"function": { +"name": "multiply", +"description": "A function that multiplies two numbers", +"parameters": { +"type": "object", +"properties": { +"a": { +"type": "number", +"description": "The first number to multiply" +}, +"b": { +"type": "number", +"description": "The second number to multiply" +} +}, +"required": ["a", "b"] +} +} } ``` @@ -1040,13 +1040,13 @@ specific format - your model will probably need different formatting! ```text {%- if tools %} - {%- for tool in tools %} - {{- '' + tool['function']['name'] + '\n' }} - {%- for argument in tool['function']['parameters']['properties'] %} - {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} - {%- endfor %} - {{- '\n' }} - {%- endif %} +{%- for tool in tools %} +{{- '' + tool['function']['name'] + '\n' }} +{%- for argument in tool['function']['parameters']['properties'] %} +{{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} +{%- endfor %} +{{- '\n' }} +{%- endif %} {%- endif %} ``` @@ -1064,19 +1064,19 @@ the list will usually only have a single element. 
Here is a sample message dict ```json { - "role": "assistant", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "multiply", - "arguments": { - "a": 5, - "b": 6 - } - } - } - ] +"role": "assistant", +"tool_calls": [ +{ +"type": "function", +"function": { +"name": "multiply", +"arguments": { +"a": 5, +"b": 6 +} +} +} +] } ``` @@ -1084,10 +1084,10 @@ And a common pattern for handling them would be something like this: ```text {%- if message['role'] == 'assistant' and 'tool_calls' in message %} - {%- for tool_call in message['tool_calls'] %} - {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} - {%- endif %} - {%- endfor %} +{%- for tool_call in message['tool_calls'] %} +{{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} +{%- endif %} +{%- endfor %} {%- endif %} ``` @@ -1100,9 +1100,9 @@ of the called function, and a "content" key containing the result of the tool ca ```json { - "role": "tool", - "name": "multiply", - "content": "30" +"role": "tool", +"name": "multiply", +"content": "30" } ``` @@ -1111,7 +1111,7 @@ name to be included in the tool response, then rendering it can be as simple as: ```text {%- if message['role'] == 'tool' %} - {{- "" + message['content'] + "" }} +{{- "" + message['content'] + "" }} {%- endif %} ``` diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index a48c046b4949..80d4b81e8614 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -39,8 +39,8 @@ by adding its response. Let's see this in action. First, let's build a chat: ```python chat = [ - {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, - {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} +{"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, +{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} ] ``` @@ -69,19 +69,19 @@ print(response[0]['generated_text'][-1]['content']) And you'll get: ```text -(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, +(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide! -So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million -things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of -Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for -something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got +So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million +things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of +Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for +something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got some wild stuff, like that Warhol guy's soup cans and all that jazz. -And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for +And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for those pesky pigeons, they're like little feathered thieves! 
(laughs) Get it? Thieves? Ah, never mind. -Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might +Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks) And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing @@ -98,7 +98,23 @@ a message and pass it back: ```python chat = response[0]['generated_text'] chat.append( - {"role": "user", "content": "Wait, what's so wild about soup cans?"} +{"role": "user", "content": "Wait, what's so wild about soup cans?"} +) +response = pipe(chat, max_new_tokens=512) +print(response[0]['generated_text'][-1]['content']) +``` +So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll +excuse me, I've got some oil changes to attend to. (winks) +``` + +You can continue the chat by appending your own response to it. The +`response` object returned by the pipeline actually contains the entire chat so far, so we can simply append +a message and pass it back: + +```python +chat = response[0]['generated_text'] +chat.append( +{"role": "user", "content": "Wait, what's so wild about soup cans?"} ) response = pipe(chat, max_new_tokens=512) print(response[0]['generated_text'][-1]['content']) @@ -107,9 +123,9 @@ print(response[0]['generated_text'][-1]['content']) And you'll get: ```text -(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! -It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's -like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" +(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! +It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's +like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" (sarcastically) Oh, yeah, real original, Andy. But, you know, back in the '60s, it was like, a big deal. People were all about challenging the @@ -122,7 +138,6 @@ But, hey, that's what makes art, art, right? (laughs) The remainder of this tutorial will cover specific topics such as performance and memory, or how to select a chat model for your needs. - ## Choosing a chat model There are an enormous number of different chat models available on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending), @@ -176,8 +191,8 @@ import torch # Prepare the input as before chat = [ - {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, - {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} +{"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, +{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} ] # 1: Load the model and tokenizer @@ -212,6 +227,7 @@ the broad ideas, and leave the details for the linked documents. The key steps a 4. We [generate](https://huggingface.co/docs/transformers/en/llm_tutorial) a response from the model. 5. The tokens output by the model are decoded back to a string +Additionally, with the introduction of the `ImageTextToTextPipeline`, you can now handle multi-modal inputs, such as combining images and text to generate responses. 
This expands the capabilities of the pipeline to include tasks like visual question answering and image-based text generation. ## Performance, memory and hardware You probably know by now that most machine learning tasks are run on GPUs. However, it is entirely possible diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 5ad56b7b5c52..4de9d532bf61 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -44,6 +44,7 @@ The original code can be found [here](https://huggingface.co/HuggingFaceM4/idefi - The processor has a `do_image_splitting` option. If `True`, each input image will be split into 4 sub-images, and concatenated with the original to form 5 images. This is useful for increasing model performance. Make sure `processor.image_processor.do_image_splitting` is set to `False` if the model was not trained with this option. - `text` passed to the processor should have the `` tokens where the images should be inserted. And `` at the end of each utterance if the text is a chat message. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as `text` to the processor. +- The `post_process_image_text_to_text` method is available for decoding the text output from the model's generated sequences. Example of how to use the processor on chat messages: @@ -63,12 +64,12 @@ image_2 = Image.open(requests.get(url_2, stream=True).raw) images = [image_1, image_2] messages = [{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s the difference between these two images?"}, - {"type": "image"}, - {"type": "image"}, - ], +"role": "user", +"content": [ +{"type": "text", "text": "What’s the difference between these two images?"}, +{"type": "image"}, +{"type": "image"}, +], }] processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b") @@ -83,7 +84,7 @@ print(text) inputs = processor(images=images, text=text, return_tensors="pt").to(device) generated_text = model.generate(**inputs, max_new_tokens=500) -generated_text = processor.batch_decode(generated_text, skip_special_tokens=True)[0] +generated_text = processor.post_process_image_text_to_text(generated_text) print("Generated text:", generated_text) ``` @@ -103,18 +104,18 @@ image_2 = Image.open(requests.get(url_2, stream=True).raw) images = [image_1, image_2] messages = [{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s the difference between these two images?"}, - {"type": "image"}, - {"type": "image"}, - ], +"role": "user", +"content": [ +{"type": "text", "text": "What’s the difference between these two images?"}, +{"type": "image"}, +{"type": "image"}, +], }, { - "role": "assistant", - "content": [ - {"type": "text", "text": "The difference is that one image is about dogs and the other one about cats."}, - ], +"role": "assistant", +"content": [ +{"type": "text", "text": "The difference is that one image is about dogs and the other one about cats."}, +], }] device = "cuda" if torch.cuda.is_available() else "cpu" @@ -130,6 +131,21 @@ labels = inputs.input_ids.clone() labels[labels == processor.tokenizer.pad_token_id] = -100 labels[labels == model.config.image_token_id] = -100 +inputs["labels"] = labels +``` +device = "cuda" if torch.cuda.is_available() else "cpu" + +processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b") +model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b") +model.to(device) + +text = 
processor.apply_chat_template(messages, add_generation_prompt=False) +inputs = processor(images=images, text=text, return_tensors="pt").to(device) + +labels = inputs.input_ids.clone() +labels[labels == processor.tokenizer.pad_token_id] = -100 +labels[labels == model.config.image_token_id] = -100 + inputs["labels"] = labels outputs = model(**inputs) @@ -138,7 +154,6 @@ loss.backward() ``` Do note that when training Idefics2 on multi-turn conversations between a user and an assistant, one typically also sets all the tokens corresponding to the user messages to -100. - ## Model optimizations: Flash Attention The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. @@ -155,8 +170,8 @@ To load and run a model using Flash Attention-2, simply change the code snippet ```diff model = Idefics2ForConditionalGeneration.from_pretrained( - "HuggingFaceM4/idefics2-8b", -+ torch_dtype=torch.float16, +"HuggingFaceM4/idefics2-8b", ++ torch_dtype=torch.float16, + attn_implementation="flash_attention_2", ).to(device) ``` @@ -177,8 +192,8 @@ Quantizing a model is as simple as passing a `quantization_config` to the model. + bnb_4bit_compute_dtype=torch.float16 + ) model = Idefics2ForConditionalGeneration.from_pretrained( - "HuggingFaceM4/idefics2-8b", -+ torch_dtype=torch.float16, +"HuggingFaceM4/idefics2-8b", ++ torch_dtype=torch.float16, + quantization_config=quantization_config, ).to(device) ``` diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9146fbd3347..dfb72e63cade 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -63,24 +63,24 @@ from transformers import LlavaNextProcessor processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What’s shown in this image?"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "This image shows a red stop sign."},] - }, - { - - "role": "user", - "content": [ - {"type": "text", "text": "Describe the image in more details."}, - ], - }, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What’s shown in this image?"}, +], +}, +{ +"role": "assistant", +"content": [{"type": "text", "text": "This image shows a red stop sign."},] +}, +{ + +"role": "user", +"content": [ +{"type": "text", "text": "Describe the image in more details."}, +], +}, ] text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) @@ -107,6 +107,12 @@ print(text_prompt) "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" ``` +[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format: +[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format: +```bash +"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" +``` + [llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format: ```bash @@ -118,7 +124,6 @@ print(text_prompt) ```bash "<|im_start|>system\nYou are a helpful 
assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" ``` - ## Usage example ### Single image inference @@ -141,13 +146,13 @@ url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247a image = Image.open(requests.get(url, stream=True).raw) conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What is shown in this image?"}, +], +}, ] prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) inputs = processor(image, prompt, return_tensors="pt").to("cuda:0") @@ -184,36 +189,36 @@ image_snowman = Image.open(requests.get(url, stream=True).raw) # Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not conversation_1 = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": "There is a red stop sign in the image."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What about this image? How many cats do you see?"}, - ], - }, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What is shown in this image?"}, +], +}, +{ +"role": "assistant", +"content": [ +{"type": "text", "text": "There is a red stop sign in the image."}, +], +}, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What about this image? How many cats do you see?"}, +], +}, ] conversation_2 = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What is shown in this image?"}, +], +}, ] prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True) @@ -228,7 +233,6 @@ inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, generate_ids = model.generate(**inputs, max_new_tokens=30) processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) ``` - ## Model optimization ### Quantization using Bitsandbytes @@ -250,9 +254,9 @@ from transformers import AutoModelForImageTextToText, BitsAndBytesConfig # specify how to quantize the model quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, +load_in_4bit=True, +bnb_4bit_quant_type="nf4", +bnb_4bit_compute_dtype=torch.float16, ) model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") @@ -266,10 +270,10 @@ First make sure to install flash-attn. 
Refer to the [original repository of Flas from transformers import AutoModelForImageTextToText model = AutoModelForImageTextToText.from_pretrained( - model_id, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - use_flash_attention_2=True +model_id, +torch_dtype=torch.float16, +low_cpu_mem_usage=True, +use_flash_attention_2=True ).to(0) ``` diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 4a6080ea2ce0..d0e959369e1d 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -30,25 +30,6 @@ The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a - The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. - - - -Mllama has an extra token used as a placeholder for image positions in the text. It means that input ids and an input embedding layer will have an extra token. But since the weights for input and output embeddings are not tied, the `lm_head` layer has one less token and will fail if you want to calculate loss on image tokens or apply some logit processors. In case you are training, make sure to mask out special `"<|image|>"` tokens in the `labels` as the model should not be trained on predicting them. - -Otherwise if you see CUDA-side index erros when generating, use the below code to expand the `lm_head` by one more token. - - -```python -old_embeddings = model.get_output_embeddings() - -num_tokens = model.vocab_size + 1 -resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=num_tokens, mean_resizing=True) -resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad) -model.set_output_embeddings(resized_embeddings) -``` - - - ## Usage Example #### Instruct model @@ -63,15 +44,15 @@ model = MllamaForConditionalGeneration.from_pretrained(model_id, device_map="aut processor = AutoProcessor.from_pretrained(model_id) messages = [ - [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What does the image show?"} - ] - } - ], +[ +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What does the image show?"} +] +} +], ] text = processor.apply_chat_template(messages, add_generation_prompt=True) @@ -103,7 +84,6 @@ output = model.generate(**inputs, do_sample=False, max_new_tokens=25) print(processor.decode(output[0], skip_special_tokens=True)) ``` - ## MllamaConfig [[autodoc]] MllamaConfig @@ -112,7 +92,6 @@ print(processor.decode(output[0], skip_special_tokens=True)) [[autodoc]] MllamaProcessor - ## MllamaImageProcessor [[autodoc]] MllamaImageProcessor @@ -141,3 +120,8 @@ print(processor.decode(output[0], skip_special_tokens=True)) [[autodoc]] MllamaVisionModel - forward + +## MllamaForImageTextToText + +[[autodoc]] MllamaForImageTextToText + - forward \ No newline at end of file diff --git a/docs/source/en/model_summary.md b/docs/source/en/model_summary.md index c7efc4c00d9b..7c70f9e223ed 100644 --- a/docs/source/en/model_summary.md +++ b/docs/source/en/model_summary.md @@ -98,6 +98,9 @@ After GPT-2, language models grew even bigger and are now known as *large langua Optical character recognition (OCR) is a long-standing text recognition task that typically involves several components to understand the image and generate the text. 
[TrOCR](model_doc/trocr) simplifies the process using an end-to-end Transformer. The encoder is a ViT-style model for image understanding and processes the image as fixed-size patches. The decoder accepts the encoder's hidden states and autoregressively generates text. [Donut](model_doc/donut) is a more general visual document understanding model that doesn't rely on OCR-based approaches. It uses a Swin Transformer as the encoder and multilingual BART as the decoder. Donut is pretrained to read text by predicting the next word based on the image and text annotations. The decoder generates a token sequence given a prompt. The prompt is represented by a special token for each downstream task. For example, document parsing has a special `parsing` token that is combined with the encoder hidden states to parse the document into a structured output format (JSON). +### ImageTextToTextPipeline + +The `ImageTextToTextPipeline` is a new addition to the multimodal capabilities of the Transformers library. It allows for generating text from both image and text inputs, enhancing tasks such as visual question answering, image captioning, or image-based text generation. This pipeline supports both single and batch processing and can operate in chat mode, making it versatile for various applications. ## Reinforcement learning diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 3363c68ea417..9c32cf8acb41 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -77,10 +77,10 @@ If you have several inputs, you can pass your input as a list: ```py transcriber( - [ - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ] +[ +"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", +"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", +] ) ``` @@ -183,14 +183,14 @@ The pipeline can also run inference on a large dataset. The easiest way we recom ```py def data(): - for i in range(1000): - yield f"My example {i}" +for i in range(1000): +yield f"My example {i}" pipe = pipeline(model="openai-community/gpt2", device=0) generated_characters = 0 for out in pipe(data()): - generated_characters += len(out[0]["generated_text"]) +generated_characters += len(out[0]["generated_text"]) ``` The iterator `data()` yields each result, and the pipeline automatically @@ -212,7 +212,7 @@ pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") for out in pipe(KeyDataset(dataset, "audio")): - print(out) +print(out) ``` @@ -292,6 +292,59 @@ pip install pytesseract +### ImageTextToTextPipeline + +The `ImageTextToTextPipeline` is a new addition to the multimodal pipelines, allowing for the generation of text given both an image and text input. This is particularly useful for tasks such as image captioning or generating descriptions based on visual content. The pipeline can handle both single and batch processing and supports chat mode for conversational models. 
+ +Example usage: + +```python +>>> from transformers import pipeline + +>>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base") +>>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of") +[{'generated_text': 'a photo of two birds'}] +``` + +For chat-based models: + +```python +>>> from transformers import pipeline + +>>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf") +>>> messages = [ +>>> { +>>> "role": "user", +>>> "content": [ +>>> { +>>> "type": "image", +>>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", +>>> }, +>>> {"type": "text", "text": "Describe this image."}, +>>> ], +>>> }, +>>> { +>>> "role": "assistant", +>>> "content": [ +>>> {"type": "text", "text": "There is a dog and"}, +>>> ], +>>> }, +>>> ] +>>> pipe(text=messages, max_new_tokens=20, return_full_text=False) +[{'input_text': [{'role': 'user', + 'content': [{'type': 'image', + 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, + {'type': 'text', 'text': 'Describe this image.'}]}, +{'role': 'assistant', + 'content': [{'type': 'text', 'text': 'There is a dog and'}]}], +'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}] +``` + +Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). + +This image-text to text pipeline can currently be loaded from `pipeline()` using the following task identifier: "image-text-to-text". + +See the list of available models on [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text). ## Using `pipeline` on large models with 🤗 `accelerate`: You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index 261abf947290..8a79a95c5d08 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -43,9 +43,9 @@ import torch device = torch.device("cuda") model = AutoModelForImageTextToText.from_pretrained( - "HuggingFaceM4/idefics2-8b", - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", +"HuggingFaceM4/idefics2-8b", +torch_dtype=torch.bfloat16, +attn_implementation="flash_attention_2", ).to(device) processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b") @@ -69,9 +69,9 @@ from PIL import Image import requests img_urls =["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"] +"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"] images = [Image.open(requests.get(img_urls[0], stream=True).raw), - Image.open(requests.get(img_urls[1], stream=True).raw)] +Image.open(requests.get(img_urls[1], stream=True).raw)] ``` Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template. @@ -79,26 +79,26 @@ Below is an example of the chat template. 
We can feed conversation turns and the ```python messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What do we see in this image?"}, - ] - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": "In this image we can see two cats on the nets."}, - ] - }, - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "And how about this image?"}, - ] - }, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "What do we see in this image?"}, +] +}, +{ +"role": "assistant", +"content": [ +{"type": "text", "text": "In this image we can see two cats on the nets."}, +] +}, +{ +"role": "user", +"content": [ +{"type": "image"}, +{"type": "text", "text": "And how about this image?"}, +] +}, ] ``` @@ -113,20 +113,35 @@ We can now pass the preprocessed inputs to the model. ```python with torch.no_grad(): - generated_ids = model.generate(**inputs, max_new_tokens=500) +generated_ids = model.generate(**inputs, max_new_tokens=500) generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) print(generated_texts) ## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.'] ``` +To use the new `ImageTextToTextPipeline` for generating text from image and text inputs, you can initialize the pipeline as follows: + +```python +from transformers import pipeline + +pipe = pipeline("image-text-to-text", model="HuggingFaceM4/idefics2-8b") +``` + +You can then use the pipeline to generate text by providing image URLs or PIL images along with text prompts: + +```python +outputs = pipe(images=img_urls, text="What do we see in these images?") +print(outputs) +``` + +This will leverage the new pipeline to handle multimodal tasks, offering greater flexibility in text generation. ## Streaming We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B. Assume we have an application that keeps chat history and takes in the new user input. We will preprocess the inputs as usual and initialize [`TextIteratorStreamer`] to handle the generation in a separate thread. This allows you to stream the generated text tokens in real-time. Any generation arguments can be passed to [`TextIteratorStreamer`]. - ```python import time from transformers import TextIteratorStreamer @@ -179,7 +194,7 @@ def model_inference( acc_text += text_token if acc_text.endswith(""): acc_text = acc_text[:-18] - yield acc_text + yield acc_text thread.join() ``` @@ -195,13 +210,12 @@ generator = model_inference( ) for value in generator: - print(value) + print(value) # In # In this # In this image ... ``` - ## Fit models in smaller hardware VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. 
@@ -220,7 +234,7 @@ from transformers import AutoModelForImageTextToText, QuantoConfig
 model_id = "HuggingFaceM4/idefics2-8b"
 quantization_config = QuantoConfig(weights="int8")
 quantized_model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="cuda", quantization_config=quantization_config
+model_id, device_map="cuda", quantization_config=quantization_config
 )
 ```
diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md
index fcc1c86e8bd7..1e8305a41888 100644
--- a/docs/source/en/tasks/video_text_to_text.md
+++ b/docs/source/en/tasks/video_text_to_text.md
@@ -34,7 +34,7 @@ This guide focuses on inference with an instruction-tuned model, [llava-hf/llava
 
 Let's begin installing the dependencies.
 
 ```bash
-pip install -q transformers accelerate flash_attn
+pip install -q transformers accelerate flash_attn
 ```
 
 Let's initialize the model and the processor.
 
@@ -58,17 +58,16 @@ import requests
 import cv2
 
 def replace_video_with_images(text, frames):
-    return text.replace("