From 81381f75679d8326ea6f7a88aba684c8cce8addb Mon Sep 17 00:00:00 2001 From: Maxr1998 Date: Sat, 14 Jun 2025 02:25:21 +0200 Subject: [PATCH 1/2] Fix white space handling when parsing LRC with word time tags - Correctly handle white space between word time tags - Support parsing time tags that surround each non-blank segment --- .../Parser/Lrc/Lines/LrcLyricParserTest.cs | 3 +- .../Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs | 53 +++++++-- LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs | 2 +- .../Parser/Lrc/Utils/LrcTimedTextUtils.cs | 105 +++++++++++++----- 4 files changed, 128 insertions(+), 35 deletions(-) diff --git a/LrcParser.Tests/Parser/Lrc/Lines/LrcLyricParserTest.cs b/LrcParser.Tests/Parser/Lrc/Lines/LrcLyricParserTest.cs index 2cdd150..2e25873 100644 --- a/LrcParser.Tests/Parser/Lrc/Lines/LrcLyricParserTest.cs +++ b/LrcParser.Tests/Parser/Lrc/Lines/LrcLyricParserTest.cs @@ -50,7 +50,8 @@ public void TestDecode(string lyric, LrcLyric expected) { Text = "帰り道は", StartTimes = [17000], - TimeTags = TestCaseTagHelper.ParseTimeTags(["[1,start]:1000", "[2,start]:2000", "[3,start]:3000", "[3,end]:4000"]), + // [0,start]:17000 is created from the line time tag + TimeTags = TestCaseTagHelper.ParseTimeTags(["[0,start]:17000", "[1,start]:1000", "[2,start]:2000", "[3,start]:3000", "[3,end]:4000"]), }, ], [ diff --git a/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs b/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs index 2ff3923..9df9c4c 100644 --- a/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs +++ b/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs @@ -13,25 +13,64 @@ public class LrcTimedTextUtilsTest #region Decode [TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] - [TestCase(" <00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", " 帰り道は", new[] { "[1,start]:17970", "[2,start]:18370", "[3,start]:18550", "[4,start]:18940", "[4,end]:19220" })] - [TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22> ", "帰り道は ", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] - [TestCase("帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] + [TestCase(" <00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] + [TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22> ", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] + [TestCase("帰<00:18.37>り<00:18.55>道<00:18.94>は<00:19.22>", "帰り道は", new[] { "[0,start]:0", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940", "[3,end]:19220" })] [TestCase("<00:17.97>帰<00:18.37>り<00:18.55>道<00:18.94>は", "帰り道は", new[] { "[0,start]:17970", "[1,start]:18370", "[2,start]:18550", "[3,start]:18940" })] [TestCase("帰り道は", "帰り道は", new string[] { })] [TestCase("", "", new string[] { })] + [TestCase(" ", "", new string[] { })] [TestCase(null, "", new string[] { })] + [TestCase("<00:51.00> <01:29.99><01:48.29> <02:31.00> <02:41.99>You gotta fight !", "You gotta fight !", new[] { "[0,start]:161990" })] // multiple empty tags public void TestDecode(string text, string expectedText, string[] expectedTimeTags) { - var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text); + var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, 0); Assert.That(actualText, Is.EqualTo(expectedText)); Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags))); } - [TestCase("<00:51.00><01:29.99><01:48.29><02:31.00><02:41.99>You gotta fight !", "You gotta fight !", new[] { "[0,start]:51000" })] // decode with invalid format. - public void TestDecodeWithInvalidFormat(string text, string expectedText, string[] expectedTimeTags) + // Surrounding time tags + [TestCase( + "<00:06.84> Every <00:07.20> <00:07.56> night <00:07.87> <00:08.19> that <00:08.46> <00:08.79> goes <00:09.19> <00:09.59> between", 6840, + "Every night that goes between", + new[] { "[0,start]:6840", "[4,end]:7200", "[6,start]:7560", "[10,end]:7870", "[12,start]:8190", "[15,end]:8460", "[17,start]:8790", "[20,end]:9190", "[22,start]:9590" } + )] + // Alternating time tags, spaced on both sides + [TestCase( + "<00:06.84> Every <00:07.56> night <00:08.19> that <00:08.79> goes <00:09.59> between", 6840, "Every night that goes between", + new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + // Alternating time tags, unspaced + [TestCase( + "<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", + new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } + )] + [TestCase( + "Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", + new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } + )] + // Alternating time tags, prefix spaced + [TestCase( + "<00:06.84> Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", 6840, "Every night that goes between", + new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + [TestCase( + "Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", 6840, "Every night that goes between", + new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + // Alternating time tags, postfix spaced + [TestCase( + "<00:06.84>Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", 6840, "Every night that goes between", + new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + [TestCase( + "Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", 6840, "Every night that goes between", + new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + public void TestDecodeWithStartTime(string text, int lineStartTime, string expectedText, string[] expectedTimeTags) { - var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text); + var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, lineStartTime); Assert.That(actualText, Is.EqualTo(expectedText)); Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags))); diff --git a/LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs b/LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs index 0123e76..85ebe8b 100644 --- a/LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs +++ b/LrcParser/Parser/Lrc/Lines/LrcLyricParser.cs @@ -36,7 +36,7 @@ public override LrcLyric Decode(string text) }; } - var (lyric, timeTags) = LrcTimedTextUtils.TimedTextToObject(rawLyric); + var (lyric, timeTags) = LrcTimedTextUtils.TimedTextToObject(rawLyric, startTimes[0]); return new LrcLyric { diff --git a/LrcParser/Parser/Lrc/Utils/LrcTimedTextUtils.cs b/LrcParser/Parser/Lrc/Utils/LrcTimedTextUtils.cs index 31d1ff0..034527f 100644 --- a/LrcParser/Parser/Lrc/Utils/LrcTimedTextUtils.cs +++ b/LrcParser/Parser/Lrc/Utils/LrcTimedTextUtils.cs @@ -1,6 +1,7 @@ // Copyright (c) karaoke.dev . Licensed under the MIT Licence. // See the LICENCE file in the repository root for full licence text. +using System.Text; using LrcParser.Model; using LrcParser.Utils; using static LrcParser.Parser.Lrc.Utils.TimeTagMode; @@ -10,53 +11,105 @@ namespace LrcParser.Parser.Lrc.Utils; internal static class LrcTimedTextUtils { /// - /// + /// Parses the passed text for word time tags. /// /// + /// /// - internal static Tuple> TimedTextToObject(string timedText) + internal static Tuple> TimedTextToObject(string timedText, int lineStartTime) { - if (string.IsNullOrEmpty(timedText)) + if (string.IsNullOrWhiteSpace(timedText)) + { return new Tuple>("", new SortedDictionary()); + } - var matchTimeTags = TimeTagUtils.WORD_TIME_TAG_REGEX.Matches(timedText); + var textLength = timedText.Length; + var lyricText = new StringBuilder(); + var timeTags = new SortedDictionary(); - var endTextIndex = timedText.Length; + var timeTagMatches = TimeTagUtils.WORD_TIME_TAG_REGEX.Matches(timedText); - var startIndex = 0; + if (timeTagMatches.Count == 0) + { + // no word time tags, return lyric as-is + return new Tuple>(timedText, new SortedDictionary()); + } - var text = string.Empty; - var timeTags = new SortedDictionary(); + var lastTimeTag = lineStartTime; + var segmentStartIndex = 0; + var insertSpace = false; + var lastTagWasStartTag = false; - foreach (var match in matchTimeTags.ToArray()) + foreach (var match in timeTagMatches.ToArray()) { - var endIndex = match.Index; + // Segment ends at the start of the next time tag + var segmentEndIndex = match.Index; + + var segment = timedText[segmentStartIndex..segmentEndIndex]; + + // Update next start index + segmentStartIndex = segmentEndIndex + match.Length; - if (startIndex < endIndex) + if (string.IsNullOrWhiteSpace(segment)) { - // add the text. - text += timedText[startIndex..endIndex]; + // The last segment was a start tag, and the next segment is empty, insert end tag + if (lastTagWasStartTag) + { + timeTags.TryAdd(new TextIndex(lyricText.Length - 1, IndexState.End), lastTimeTag); + lastTagWasStartTag = false; + } + + // Skip empty lyric, update start time + lastTimeTag = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag); + + // Segment contains only whitespace but isn't empty, insert a space before an upcoming valid segment. + if (segment.Length > 0) insertSpace = true; + continue; } - // update the new start for next time-tag calculation. - startIndex = endIndex + match.Length; + // If the last segment ended with whitespace, or the current starts with whitespace, + // insert a single space before the next segment. + if ((char.IsWhiteSpace(segment[0]) || insertSpace) && lyricText.Length > 0) + { + lyricText.Append(' '); + } - // add the time-tag. - var hasText = startIndex < endTextIndex; - var isEmptyStringNext = hasText && timedText[startIndex] == ' '; + // Add start time tag for next lyric + timeTags.TryAdd(new TextIndex(lyricText.Length), lastTimeTag); + lastTagWasStartTag = true; - var state = hasText && !isEmptyStringNext ? IndexState.Start : IndexState.End; - var textIndex = text.Length - (state == IndexState.Start ? 0 : 1); - var time = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag); + // Append lyric segment without surrounding whitespace + lyricText.Append(segment.Trim()); - // using try add because it might be possible with duplicated time-tag position in the lyric. - timeTags.TryAdd(new TextIndex(textIndex, state), time); + // Update start time for the next segment + lastTimeTag = TimeTagUtils.ConvertTimeTagToMilliseconds(match.Value, WordTimeTag); + + // Reset insertSpace flag after adding a segment, + // and instead track whether this new segment ends with whitespace + insertSpace = char.IsWhiteSpace(segment[^1]); } - // should add remaining text at the right of the end time-tag. - text += timedText[startIndex..endTextIndex]; + var remaining = timedText[segmentStartIndex..textLength]; + + if (!string.IsNullOrWhiteSpace(remaining)) + { + if ((char.IsWhiteSpace(remaining[0]) || insertSpace) && lyricText.Length > 0) + { + // Add space before the next segment + lyricText.Append(' '); + } + + // Add remaining text with start time tag + timeTags.TryAdd(new TextIndex(lyricText.Length), lastTimeTag); + lyricText.Append(remaining.Trim()); + } + else + { + // No remaining text, last time tag was end tag + timeTags.TryAdd(new TextIndex(lyricText.Length - 1, IndexState.End), lastTimeTag); + } - return new Tuple>(text, timeTags); + return new Tuple>(lyricText.ToString(), timeTags); } internal static string ToTimedText(string text, SortedDictionary timeTags) From 0c303d84b5df96a378d969f2de2525ec93bb0b2c Mon Sep 17 00:00:00 2001 From: andy840119 Date: Mon, 23 Jun 2025 22:17:59 +0800 Subject: [PATCH 2/2] Adjust the test case. --- .../Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs b/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs index 9df9c4c..f0527d7 100644 --- a/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs +++ b/LrcParser.Tests/Parser/Lrc/Utils/LrcTimedTextUtilsTest.cs @@ -22,51 +22,59 @@ public class LrcTimedTextUtilsTest [TestCase(" ", "", new string[] { })] [TestCase(null, "", new string[] { })] [TestCase("<00:51.00> <01:29.99><01:48.29> <02:31.00> <02:41.99>You gotta fight !", "You gotta fight !", new[] { "[0,start]:161990" })] // multiple empty tags - public void TestDecode(string text, string expectedText, string[] expectedTimeTags) - { - var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, 0); - - Assert.That(actualText, Is.EqualTo(expectedText)); - Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags))); - } - // Surrounding time tags [TestCase( - "<00:06.84> Every <00:07.20> <00:07.56> night <00:07.87> <00:08.19> that <00:08.46> <00:08.79> goes <00:09.19> <00:09.59> between", 6840, + "<00:06.84> Every <00:07.20> <00:07.56> night <00:07.87> <00:08.19> that <00:08.46> <00:08.79> goes <00:09.19> <00:09.59> between", "Every night that goes between", new[] { "[0,start]:6840", "[4,end]:7200", "[6,start]:7560", "[10,end]:7870", "[12,start]:8190", "[15,end]:8460", "[17,start]:8790", "[20,end]:9190", "[22,start]:9590" } )] // Alternating time tags, spaced on both sides [TestCase( - "<00:06.84> Every <00:07.56> night <00:08.19> that <00:08.79> goes <00:09.59> between", 6840, "Every night that goes between", + "<00:06.84> Every <00:07.56> night <00:08.19> that <00:08.79> goes <00:09.59> between", "Every night that goes between", new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } )] // Alternating time tags, unspaced [TestCase( - "<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", + "<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", "Everynightthatgoesbetween", new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } )] [TestCase( - "Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", - new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } + "Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", "Everynightthatgoesbetween", + new[] { "[0,start]:0", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } )] // Alternating time tags, prefix spaced [TestCase( - "<00:06.84> Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", 6840, "Every night that goes between", + "<00:06.84> Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", "Every night that goes between", new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } )] [TestCase( - "Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", 6840, "Every night that goes between", - new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + "Every<00:07.56> night<00:08.19> that<00:08.79> goes<00:09.59> between", "Every night that goes between", + new[] { "[0,start]:0", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } )] // Alternating time tags, postfix spaced [TestCase( - "<00:06.84>Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", 6840, "Every night that goes between", + "<00:06.84>Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", "Every night that goes between", new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } )] [TestCase( - "Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", 6840, "Every night that goes between", - new[] { "[0,start]:6840", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + "Every <00:07.56>night <00:08.19>that <00:08.79>goes <00:09.59>between", "Every night that goes between", + new[] { "[0,start]:0", "[6,start]:7560", "[12,start]:8190", "[17,start]:8790", "[22,start]:9590" } + )] + public void TestDecode(string text, string expectedText, string[] expectedTimeTags) + { + var (actualText, actualTimeTags) = LrcTimedTextUtils.TimedTextToObject(text, 0); + + Assert.That(actualText, Is.EqualTo(expectedText)); + Assert.That(actualTimeTags, Is.EqualTo(TestCaseTagHelper.ParseTimeTags(expectedTimeTags))); + } + + [TestCase( + "<00:06.84>Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", + new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } + )] + [TestCase( + "Every<00:07.56>night<00:08.19>that<00:08.79>goes<00:09.59>between", 6840, "Everynightthatgoesbetween", + new[] { "[0,start]:6840", "[5,start]:7560", "[10,start]:8190", "[14,start]:8790", "[18,start]:9590" } )] public void TestDecodeWithStartTime(string text, int lineStartTime, string expectedText, string[] expectedTimeTags) {