diff --git a/assets/core-video-nodes/workflow.svg b/assets/core-video-nodes/workflow.svg new file mode 100644 index 0000000..70a99e3 --- /dev/null +++ b/assets/core-video-nodes/workflow.svg @@ -0,0 +1 @@ +T: 0.00sI: 0N: 10 [10]V: 66FPS:58.82Codec From Video Stream#6 CoreVideoMocksvideo_streamcodecDecode Video Stream#2 CoreVideoMocksstreamimagesframe_interval1start_frame0end_frame-1Video Stream Data#7 CoreVideoMocksvideo_streamframes_per_secondframe_countdurationwidthheightInvert Image#4 🦊imageIMAGECombine Video#8 CoreVideoMocksvideo_streamaudio_streamsubtitle_streamcombined_videocontainerwebm680 × 680Load Video#1 CoreVideoMocksvideovideoGjwv3x4acAAYq9C.jpgchoose file to uploadEncode Video Stream#5 CoreVideoMocksimagescodecframes_per_secondencoded_streamSave Video#9 CoreVideoMocksvideo_streamfilename_prefixvideo_outputSplit Video#3 CoreVideoMocksvideovideo_streamsaudio_streamssubtitle_streamsGet Video Stream#10 CoreVideoMocksstreamsstreamstream_index0{"last_node_id":10,"last_link_id":20,"nodes":[{"id":6,"type":"CoreVideoMocks:CodecFromVideoStream","pos":[490,330],"size":[210,58],"flags":{},"order":4,"mode":0,"inputs":[{"name":"video_stream","type":"STRING","widget":{"name":"video_stream"},"link":18}],"outputs":[{"name":"codec","localized_name":"codec","type":"STRING","links":[7],"slot_index":0}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:CodecFromVideoStream"},"widgets_values":[""]},{"id":2,"type":"CoreVideoMocks:DecodeVideoStream","pos":[490,430],"size":[315,130],"flags":{},"order":3,"mode":0,"inputs":[{"name":"stream","type":"STRING","widget":{"name":"stream"},"link":17}],"outputs":[{"name":"images","localized_name":"images","type":"IMAGE","links":[4],"slot_index":0}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:DecodeVideoStream"},"widgets_values":["",1,0,-1]},{"id":7,"type":"CoreVideoMocks:VideoStreamData","pos":[490,600],"size":[253.60000610351562,138],"flags":{},"order":5,"mode":0,"inputs":[{"name":"video_stream","type":"STRING","widget":{"name":"video_stream"},"link":19}],"outputs":[{"name":"frames_per_second","localized_name":"frames_per_second","type":"FLOAT","links":[9],"slot_index":0},{"name":"frame_count","localized_name":"frame_count","type":"INT","links":null,"slot_index":1},{"name":"duration","localized_name":"duration","type":"FLOAT","links":null},{"name":"width","localized_name":"width","type":"INT","links":null},{"name":"height","localized_name":"height","type":"INT","links":null}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:VideoStreamData"},"widgets_values":[""]},{"id":4,"type":"ImageInvert","pos":[840,330],"size":[210,26],"flags":{},"order":6,"mode":0,"inputs":[{"name":"image","localized_name":"image","type":"IMAGE","link":4}],"outputs":[{"name":"IMAGE","localized_name":"IMAGE","type":"IMAGE","links":[5],"slot_index":0}],"properties":{"cnr_id":"comfy-core","ver":"0.3.26","Node name for 
S&R":"ImageInvert"}},{"id":8,"type":"CoreVideoMocks:CombineVideo","pos":[1410,140],"size":[315,170],"flags":{},"order":8,"mode":0,"inputs":[{"name":"video_stream","type":"STRING","widget":{"name":"video_stream"},"link":10},{"name":"audio_stream","type":"STRING","widget":{"name":"audio_stream"},"link":11},{"name":"subtitle_stream","type":"STRING","widget":{"name":"subtitle_stream"},"link":12}],"outputs":[{"name":"combined_video","localized_name":"combined_video","type":"STRING","links":[13],"slot_index":0}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:CombineVideo"},"widgets_values":["","","","webm"]},{"id":1,"type":"CoreVideoMocks:LoadVideo","pos":[-150,260],"size":[315,294],"flags":{},"order":0,"mode":0,"inputs":[],"outputs":[{"name":"video","localized_name":"video","type":"STRING","links":[2],"slot_index":0}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:LoadVideo"},"widgets_values":["Gjwv3x4acAAYq9C.jpg","image"]},{"id":5,"type":"CoreVideoMocks:EncodeVideoStream","pos":[1070,460],"size":[270.3999938964844,98],"flags":{},"order":7,"mode":0,"inputs":[{"name":"images","localized_name":"images","type":"IMAGE","link":5},{"name":"codec","localized_name":"codec","type":0,"link":7},{"name":"frames_per_second","type":"FLOAT","widget":{"name":"frames_per_second"},"link":9}],"outputs":[{"name":"encoded_stream","localized_name":"encoded_stream","type":"STRING","links":[10],"slot_index":0}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:EncodeVideoStream"},"widgets_values":[30]},{"id":9,"type":"CoreVideoMocks:SaveVideo","pos":[1420,350],"size":[315,82],"flags":{},"order":9,"mode":0,"inputs":[{"name":"video_stream","type":"STRING","widget":{"name":"video_stream"},"link":13}],"outputs":[],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:SaveVideo"},"widgets_values":["","video_output"]},{"id":3,"type":"CoreVideoMocks:SplitVideo","pos":[-140,620],"size":[315,98],"flags":{},"order":1,"mode":0,"inputs":[{"name":"video","type":"STRING","widget":{"name":"video"},"link":2}],"outputs":[{"name":"video_streams","localized_name":"video_streams","type":"STRING","links":[20],"slot_index":0},{"name":"audio_streams","localized_name":"audio_streams","type":"STRING","links":[11],"slot_index":1},{"name":"subtitle_streams","localized_name":"subtitle_streams","type":"STRING","links":[12],"slot_index":2}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for S&R":"CoreVideoMocks:SplitVideo"},"widgets_values":[""]},{"id":10,"type":"CoreVideoMocks:GetVideoStream","pos":[240,600],"size":[210,82],"flags":{},"order":2,"mode":0,"inputs":[{"name":"streams","type":"STRING","widget":{"name":"streams"},"link":20}],"outputs":[{"name":"stream","localized_name":"stream","type":"STRING","links":[17,18,19]}],"properties":{"aux_id":"Immac/ComfyUI-CoreVideoMocks","ver":"7616e19e14eb7f3c8d280dc70c09af69dd6305f9","Node name for 
S&R":"CoreVideoMocks:GetVideoStream"},"widgets_values":["",0]}],"links":[[2,1,0,3,0,"STRING"],[4,2,0,4,0,"IMAGE"],[5,4,0,5,0,"IMAGE"],[7,6,0,5,1,"STRING"],[9,7,0,5,2,"FLOAT"],[10,5,0,8,0,"STRING"],[11,3,1,8,1,"STRING"],[12,3,2,8,2,"STRING"],[13,8,0,9,0,"STRING"],[17,10,0,2,0,"STRING"],[18,10,0,6,0,"STRING"],[19,10,0,7,0,"STRING"],[20,3,0,10,0,"STRING"]],"groups":[],"config":{},"extra":{"ds":{"scale":1,"offset":[250,-40]},"reroutes":[{"id":3,"pos":[410,650],"linkIds":[]},{"id":4,"pos":[440,210],"linkIds":[11]},{"id":5,"pos":[440,240],"linkIds":[12]}],"linkExtensions":[{"id":11,"parentId":4},{"id":12,"parentId":5}]},"version":0.4} \ No newline at end of file diff --git a/rfcs/####-Core Video Nodes.md b/rfcs/####-Core Video Nodes.md new file mode 100644 index 0000000..3cfaebf --- /dev/null +++ b/rfcs/####-Core Video Nodes.md @@ -0,0 +1,87 @@ +# RFC: Core Video Nodes + +- Start Date: 2025/03/15 +- Target Major Version: TBD +- Reference Issues: N/A +- Implementation PR: + +## Summary + +This RFC proposes the integration of fundamental video input and output functionalities into ComfyUI. Specifically, features for loading, splitting (demuxing), decoding, encoding, combining (muxing), and saving video files directly within the ComfyUI core. This enhancement aims to streamline video-related workflows, broaden ComfyUI's applicability, and reduce reliance on external custom nodes for basic video operations. + +## Basic example + +A basic example of the proposed Node usage: +![alt text](../assets/core-video-nodes/workflow.svg) +## Motivation + +Video generation has become ubiquitous, and new video models that can run on consumer-grade hardware are becoming common. This means that having better control over video and its various components directly in the Comfy Core would provide a solid base for usage and experimentation for users and developers. + +The most popular third-party Node (VHS) abstracts away several aspects of what a video file contains. If further development on video is to be explored, ComfyUI needs to be able to handle these more fundamental aspects, at least in a general sense. + +This RFC focuses on the following: +- **Loading**: It is currently not possible to load a video using only ComfyUI Core; third-party nodes are required. +- **Demultiplexing (Demuxing/Splitting Video)**: Videos are more than just a series of images. They can contain audio, subtitles, and even multiple tracks of the same type (even video). ComfyUI should be able to handle these situations, including corner cases. +- **Decoding**: Once the video streams are demuxed, they need to be decoded. This process converts the compressed video data into a sequence of uncompressed frames. Due to possible licensing issues, ComfyUI Core should prioritize support for open codecs like VP9 and AV1. +- **Encoding**: After processing the video frames, they need to be encoded back into a compressed video stream. The main reason to manipulate streams is to combine them arbitrarily. +- **Multiplexing (Muxing/Combining Video)**: Once the individual video, audio, and subtitle streams have been processed and encoded, ComfyUI needs to provide a method for combining them into a final video container. To start with, ComfyUI should support: + - **WebM Container Format**: Prioritizing the WebM container format, which is well-suited for AV1 and VP9 and offers excellent compatibility with web browsers and open-source tools. 
+
+## Detailed design
+
+I have provided a [set of mock custom nodes](https://github.com/Immac/ComfyUI-CoreVideoMocks) that represent what I believe is a good starting point for this development. Please note that these are just mocks and will therefore connect to anything, so use the provided [workflow](https://github.com/Immac/ComfyUI-CoreVideoMocks/blob/master/Mock%20Video%20Workflows.json) to understand how they are supposed to be used.
+
+![alt text](../assets/core-video-nodes/workflow.svg)
+
+The node `SaveWEBM` already does a good amount of what is needed; its functionality just needs to be redistributed across several nodes. The main idea is to use PyAV and/or pyffmpeg to implement each node as needed. At the time of writing, I have had some issues with PyAV while trying its examples, as seen [here](https://github.com/PyAV-Org/PyAV/discussions/1769), so the version used should be thoroughly tested.
+My current investigation into this topic is very limited and is summarized [here](https://github.com/Immac/video_manipulation_with_python). Nevertheless, the `SaveWEBM` node serves as an excellent starting point for further development.
+
+Regarding the actual coding of the nodes, a _generic mechanism_ for nodes to receive complete functions through their inputs would be one way to implement each codec separately, similar to how it is done with KSamplers. However, I believe it can be made even more _generic_. I have not studied the code extensively enough to make a formal proposal.
+
+## Drawbacks
+
+- There is already a way to load and save videos, so if there are more pressing matters, this is not strictly necessary.
+- Video encoding tends to be fickle, so creating tests for the various moving parts will be challenging.
+- People might misunderstand the nature of `Streams`, so some education on how video files actually work may be needed.
+
+## Alternatives
+
+An alternative approach is to adopt the video node implementations provided by VHS.
+
+While basic video processing is currently supported, as video generation needs grow, more granular control over video and its components will become essential for efficiency and advanced workflows. Not implementing this would leave ComfyUI relying on more custom nodes to fill the gap.
+
+## Adoption strategy
+
+This RFC only adds new nodes, so there should be no conflicts with existing workflows. Adding the nodes as `BETA` and giving them some time to be tested should be enough:
+
+- Create the necessary nodes to go from video to video, excluding audio and subtitles, and add an experimental flag to mark them as `BETA`.
+- Create examples and documentation on intended usage.
+- Gradually add the missing stream types (audio and subtitles).
+
+## Miscellaneous
+
+A compressed video stream contains three types of frames: I-frames (Intra-coded), P-frames (Predicted), and B-frames (Bi-predictive). For image generation, when extracting frames as individual images, each decoded frame will be a full, independent image, regardless of its original frame type. However, being able to distinguish between these frame types might be useful for future features or optimizations.
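+
+If frame-type information does turn out to be useful, PyAV already exposes it on decoded frames. The following is only an illustrative sketch (it assumes PyAV and is not a proposed node API); the exact representation of `pict_type` depends on the PyAV version.
+
+```python
+import av  # PyAV
+
+# Count how many I-, P-, and B-frames a video stream contains.
+# Every decoded frame is still a full image; pict_type only records how it was coded.
+counts = {}
+with av.open("input.webm") as container:
+    for frame in container.decode(video=0):
+        key = str(frame.pict_type)
+        counts[key] = counts.get(key, 0) + 1
+
+print(counts)
+```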
+
+Some practical uses for videos with multiple video streams:
+- Adaptive bitrate streaming (where different quality versions of the video are stored in the same file).
+- Videos with multiple camera angles.
+- Videos with different language versions.
+
+## Unresolved questions
+
+The actual code inside the nodes is still to be determined; video encoding can be messy.
+
+How will the separate new Video Container and Video/Audio/Subtitle Streams handle metadata? Once separated from its parent Container, it could be advantageous for a Stream to keep most of the relevant information. I do not know whether the underlying libraries already work that way.
+
+There also needs to be a mechanism for managing the resource consumption of previews when loading videos.
+
+I have probably overlooked a lot of things, but I hope this is comprehensive enough to be a start.
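+
+To make "video encoding can be messy" a bit more concrete, here is a rough PyAV sketch of the encode-and-mux half of the pipeline (VP9 into WebM). Everything here is an assumption for illustration: the codec choice, the options, and the helper itself are not proposed APIs, and `libvpx-vp9` availability depends on the FFmpeg build bundled with PyAV.
+
+```python
+import av  # PyAV
+import numpy as np
+
+
+def encode_and_mux(images, out_path="video_output.webm", fps=30):
+    """Encode RGB frames to VP9 and mux them into a WebM container."""
+    container = av.open(out_path, mode="w")  # container format inferred from the .webm extension
+    stream = container.add_stream("libvpx-vp9", rate=fps)
+    stream.width = images[0].shape[1]
+    stream.height = images[0].shape[0]
+    stream.pix_fmt = "yuv420p"
+
+    for image in images:
+        frame = av.VideoFrame.from_ndarray(image, format="rgb24")
+        for packet in stream.encode(frame):  # the encoder may buffer frames and emit packets later
+            container.mux(packet)
+
+    for packet in stream.encode():  # flush whatever the encoder is still holding
+        container.mux(packet)
+    container.close()
+
+
+# Two flat gray frames, just to exercise the code path.
+encode_and_mux([np.full((64, 64, 3), 128, dtype=np.uint8)] * 2)
+```
+
+In terms of the proposed nodes, `Encode Video Stream`, `Combine Video`, and `Save Video` would roughly split this work between them.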