Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dev-share-api.Tests/TestHost.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ public static IServiceProvider BuildTestServiceProvider()
.Build();

var services = new ServiceCollection();
services.AddApplicationServices(config);
services.AddInfrastructureServices(config)
.AddApplicationServices();

return services.BuildServiceProvider();
}
Expand Down
217 changes: 101 additions & 116 deletions dev-share-api/Controllers/ApiController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
using System.Text;
using Executor;
using System.Collections.Concurrent;
using System.Text.Json;
using Newtonsoft.Json.Linq;


namespace Controllers;
Expand All @@ -15,25 +17,26 @@ namespace Controllers;
[Route("api")]
public class ExtractController : ControllerBase
{
private readonly ISummaryService _summaryService;

private readonly IEmbeddingService _embeddingService;
private readonly IVectorService _vectorService;
private readonly ShareChainExecutor _shareChainExecutor;
private readonly OnlineResearchService _onlineResearchService;
private readonly IResourceService _resourceService;
private readonly IOnlineResearchService _onlineResearchService;
private readonly IServiceScopeFactory _scopeFactory;
private static readonly ConcurrentDictionary<string, ShareTask> TaskStore = new();

public ExtractController(
ISummaryService summaryService,
IEmbeddingService embeddingService,
IVectorService vectorService,
ShareChainExecutor shareChainExecutor,
OnlineResearchService onlineResearchService)
IOnlineResearchService onlineResearchService,
IServiceScopeFactory scopeFactory,
IResourceService resourceService)
{
_summaryService = summaryService;
_embeddingService = embeddingService;
_vectorService = vectorService;
_shareChainExecutor = shareChainExecutor;
_onlineResearchService = onlineResearchService;
_scopeFactory = scopeFactory;
_resourceService = resourceService;
}

[HttpPost("share")]
Expand Down Expand Up @@ -67,26 +70,14 @@ public async Task<IActionResult> Share([FromBody] UrlRequest request)

_ = Task.Run(async () =>
{
using var scope = _scopeFactory.CreateScope();
var executor = scope.ServiceProvider.GetRequiredService<ShareChainExecutor>();
try
{
Console.WriteLine($"Extracting: {url}");
var result = TryHtmlAgilityPack(url);
if (string.IsNullOrWhiteSpace(result))
result = await TryPlaywright(url);
if (string.IsNullOrWhiteSpace(result))
throw new Exception("Content extraction failed.");

var prompt = new StringBuilder()
.AppendLine("You will receive an input text and your task is to summarize the article in no more than 100 words.")
.AppendLine("Only return the summary. Do not include any explanation.")
.AppendLine("# Article content:")
.AppendLine($"{result}")
.ToString();

await _shareChainExecutor.ExecuteAsync(new ResourceShareContext
await executor.ExecuteAsync(new ResourceShareContext
{
Url = url,
Prompt = prompt
Insight = request.Insight
});

task.Status = "success";
Expand Down Expand Up @@ -117,6 +108,62 @@ public IActionResult GetStatus(string taskId)
});
}

/// <summary>
/// Searches shared resources related to the text in <paramref name="request"/>.
/// The vector store is queried for both resources and insights. When either query
/// comes back null or empty the method falls back to online research; otherwise the
/// hits are reranked and each ranked id is loaded through the resource service.
/// </summary>
/// <param name="request">Search text plus the number of results wanted (1-100).</param>
/// <returns>
/// 200 with <c>source</c> ("online" or "vector") and <c>result</c>;
/// 400 when the text is blank or <c>TopRelatives</c> is out of range;
/// 500 when any downstream call throws.
/// </returns>
[HttpPost("search")]
public async Task<IActionResult> Search([FromBody] SearchRequest request)
{
    if (string.IsNullOrWhiteSpace(request.Text))
    {
        return BadRequest(new { message = "Search text cannot be empty." });
    }

    if (request.TopRelatives <= 0 || request.TopRelatives > 100)
    {
        return BadRequest("TopRelatives must be between 1 and 100.");
    }

    try
    {
        // 1. Query the vector store for matching resources and insights.
        var resourceResults = await _vectorService.SearchResourceAsync(
            query: request.Text,
            topK: request.TopRelatives);

        var insightResults = await _vectorService.SearchInsightAsync(
            query: request.Text,
            topK: request.TopRelatives);

        if (resourceResults == null
            || resourceResults.Count == 0
            || insightResults == null
            || insightResults.Count == 0)
        {
            // Fallback to online research when either side has no hits.
            var onlineResult = await _onlineResearchService.PerformOnlineResearchAsync(request.Text, request.TopRelatives);
            return Ok(new { source = "online", result = onlineResult.ToList() });
        }

        // 2. Merge both hit lists into a single list ordered by combined score.
        var rerankResults = GetRerankedList(resourceResults, insightResults);

        // 3. Load the full resources by id, preserving the reranked order.
        var results = new List<ResourceDto>();
        foreach (var item in rerankResults)
        {
            // A null or non-numeric id would make long.Parse throw and fail the whole
            // request; skip that single hit instead and keep the remaining results.
            if (!long.TryParse(item.ResourceId, out var resourceId))
            {
                Console.Error.WriteLine($"Search: skipping non-numeric resource id '{item.ResourceId}'.");
                continue;
            }

            var resource = await _resourceService.GetResourceById(resourceId);
            if (resource != null)
            {
                results.Add(resource);
            }
        }

        return Ok(new { source = "vector", result = results });
    }
    catch (Exception ex)
    {
        // Keep the client-facing message generic, but do not lose the failure details.
        Console.Error.WriteLine($"Search failed: {ex}");
        return StatusCode(500, "Search failed due to an internal error.");
    }
}


[HttpPost("vector/init")]
public async Task<ActionResult<float[]>> InitVectorDB()
{
Expand Down Expand Up @@ -155,99 +202,37 @@ public async Task<IActionResult> ShareInsight([FromBody] ShareInsightRequest req
return Ok();
}

[HttpPost("search")]
public async Task<IActionResult> Search([FromBody] SearchRequest request)
//todo make sure the return data from service is List<Resource> and List<Insight>
/// <summary>
/// Merges vector-search hits into one list ordered by descending score.
/// Each resource id is scored as 0.7 x its resource score plus 0.3 x the average
/// score of the insights grouped under that id; a side with no entry counts as 0.
/// </summary>
/// <param name="resources">Resource hits; keyed by <c>Id</c>.</param>
/// <param name="insights">Insight hits; grouped by <c>ResourceId</c>.</param>
/// <returns>One <c>Rerank</c> entry per distinct resource id, highest score first.</returns>
private static List<Rerank> GetRerankedList(List<VectorResourceDto> resources, List<VectorInsightDto> insights)
{
if (string.IsNullOrWhiteSpace(request.Text))
{
return BadRequest("Search text cannot be empty.");
}
if (request.TopRelatives <= 0 || request.TopRelatives > 100)
return BadRequest("TopRelatives must be between 1 and 100.");

try
{
var resourceResults = await _vectorService.SearchResourceAsync(
query: request.Text,
topK: request.TopRelatives);

var insightResults = await _vectorService.SearchInsightAsync(
query: request.Text,
topK: request.TopRelatives);

if (resourceResults == null
|| resourceResults.Count == 0
|| insightResults == null
|| insightResults.Count == 0)
{
// Fallback to online research
var onlineResult = await _onlineResearchService.PerformOnlineResearchAsync(request.Text);
return Ok(new { source = "online", result = onlineResult });
}

return Ok(new { source = "vector", result = resourceResults });
}
catch (Exception)
{
return StatusCode(500, "Search failed due to an internal error.");
}
}

private string? TryHtmlAgilityPack(string url)
{
try
{
var web = new HtmlWeb
// average insight score per resource id
var insightGroups = insights
.GroupBy(c => c.ResourceId)
.ToDictionary(
g => g.Key,
g => g.Average(c => c.Score)
);

// lookup table: resource id -> resource score
// NOTE(review): ToDictionary throws ArgumentException when two resources share an Id —
// confirm the vector search never returns duplicate ids.
var resourceScores = resources
.ToDictionary(c => c.Id, c => c.Score);

// union of all resource ids from both result sets
// NOTE(review): Union already yields distinct keys, so the Distinct() call is redundant.
var allResourceIds = resourceScores.Keys
.Union(insightGroups.Keys)
.Distinct();

var result = allResourceIds
.Select(id => new Rerank
{
// Set the User-Agent so that some sites do not block the crawler
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) " +
"Chrome/120.0.0.0 Safari/537.36"
};
var doc = web.Load(url);

//TODO encoding issue
// using var client = new HttpClient();
// var bytes = client.GetByteArrayAsync(url).Result;
// var html = System.Text.Encoding.UTF8.GetString(bytes);
// var doc = new HtmlDocument();
// doc.LoadHtml(html);


// Extract the page title
var titleNode = doc.DocumentNode.SelectSingleNode("//title");
Console.WriteLine("Title: " + titleNode?.InnerText);

// Extract the text of all paragraphs
var paragraphs = doc.DocumentNode.SelectNodes("//p");
if (paragraphs == null) return null;

var title = titleNode?.InnerText.Trim() ?? "";
var paragraphText = string.Join("\n", paragraphs
.Select(p => p.InnerText.Trim())
.Where(t => !string.IsNullOrWhiteSpace(t)));

return title + "\n\n" + paragraphText;
}
catch
{
return null;
}
}

// Use Playwright to load the page in a simulated browser and extract paragraph content (for CSR pages)
private async Task<string> TryPlaywright(string url)
{
// Launch the Playwright browser (headless mode)
using var playwright = await Playwright.CreateAsync();
await using var browser = await playwright.Chromium.LaunchAsync(new() { Headless = true });

// Open a new page and navigate to the target URL, waiting for network idle (page rendering complete)
var page = await browser.NewPageAsync();
await page.GotoAsync(url, new PageGotoOptions { WaitUntil = WaitUntilState.NetworkIdle });

// Extract the innerText of all <p> elements, dropping empty lines
var text = await page.EvalOnSelectorAllAsync<string[]>("p", "els => els.map(e => e.innerText).filter(t => t.trim().length > 0)");
return string.Join("\n", text);
ResourceId = id,
Score =
// weighted blend: 70% resource score, 30% average insight score; missing side counts as 0
(resourceScores.TryGetValue(id, out var rScore) ? rScore : 0) * 0.7 +
(insightGroups.TryGetValue(id, out var iAvg) ? iAvg : 0) * 0.3
})
.OrderByDescending(r => r.Score)
.ToList();

return result;
}
}
4 changes: 2 additions & 2 deletions dev-share-api/Executor/ShareChainExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

public async Task ExecuteAsync(ResourceShareContext context)
{
preHandle(context);
await preHandle(context);
foreach (var handler in _handlers)
{
// Check if the handler should be skipped
Expand All @@ -30,9 +30,9 @@
}
}

private async void preHandle(ResourceShareContext context)
private async Task preHandle(ResourceShareContext context)
{
ResourceDto resourceDto = await _resourceService.GetResourceByUrl(UrlManageUtil.NormalizeUrl(context.Url));

Check warning on line 35 in dev-share-api/Executor/ShareChainExecutor.cs

View workflow job for this annotation

GitHub Actions / ci

Converting null literal or possible null value to non-nullable type.

Check warning on line 35 in dev-share-api/Executor/ShareChainExecutor.cs

View workflow job for this annotation

GitHub Actions / ci

Possible null reference argument for parameter 'url' in 'string UrlManageUtil.NormalizeUrl(string url)'.
if (resourceDto != null)
{
context.ExistingResource = new ResourceDto()
Expand Down
36 changes: 31 additions & 5 deletions dev-share-api/Handle/DatabaseShareChainHandle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ namespace Services;
public class DatabaseShareChainHandle : BaseShareChainHandle
{
private readonly IVectorService _vectorService;
private readonly IUserInsightService _userInsightService;
private readonly IResourceService _resourceService;

public DatabaseShareChainHandle(IVectorService vectorService)
public DatabaseShareChainHandle(IVectorService vectorService, IUserInsightService userInsightService, IResourceService resourceService)
{
_vectorService = vectorService;
_userInsightService = userInsightService;
_resourceService = resourceService;
}

protected override void Validate(ResourceShareContext context)
Expand All @@ -19,23 +23,45 @@ protected override void Validate(ResourceShareContext context)

// Persists the outcome of a share: stores the resource (only when the context carries
// no existing resource), then stores the insight, and returns HandlerResult.Success().
protected override async Task<HandlerResult> ProcessAsync(ResourceShareContext context)
{
var resourceId = IdGeneratorUtil.GetNextId().ToString();

var resourceId = 0L;

// No existing resource on the context: allocate a new id and write the resource
// to both the resource service and the vector service.
if (context.ExistingResource == null)
{
resourceId = IdGeneratorUtil.GetNextId();
await _resourceService.AddResourceAsync(
new ResourceDto
{
ResourceId = resourceId,
Content = context.Summary,
Url = context.Url
});
await _vectorService.UpsertResourceAsync(
resourceId.ToString(),
context.Url!,
resourceId,
context.Summary!,
context.ResourceVectors!);
}

else
{
// Reuse the id of the resource already attached to the context.
resourceId = context.ExistingResource.ResourceId;
}

// The insight is written on every call, for new and existing resources alike.
// NOTE(review): Insight and InsightVectors are dereferenced with '!' — verify that
// Validate rejects a context without an insight, otherwise nulls reach both services.
await _vectorService.UpsertInsightAsync(
IdGeneratorUtil.GetNextId().ToString(),
context.Url!,
context.Insight!,
resourceId,
resourceId.ToString(),
context.InsightVectors!);

// NOTE(review): the writes in this method are independent awaits with no rollback
// visible here; a failure partway may leave the stores out of sync — confirm acceptable.
await _userInsightService.AddUserInsightAsync(
new UserInsightDto
{
ResourceId = resourceId,
Content = context.Insight
});

return HandlerResult.Success();

}
}
5 changes: 4 additions & 1 deletion dev-share-api/Handle/EmbeddingShareChainHandle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ protected override void Validate(ResourceShareContext context)

protected override async Task<HandlerResult> ProcessAsync(ResourceShareContext context)
{
context.ResourceVectors = await GetVectors(context.Summary);
if(context.ExistingResource == null)
{
context.ResourceVectors = await GetVectors(context.Summary);
}

if (!string.IsNullOrWhiteSpace(context.Insight))
{
Expand Down
Loading
Loading