1+ import logging
12from collections import Counter
23from pathlib import Path
34from typing import Literal
45
56from codegen .git .utils .file_utils import split_git_path
67from codegen .shared .enums .programming_language import ProgrammingLanguage
78
9+ logger = logging .getLogger (__name__ )
10+
11+ # Minimum ratio of files that must match the dominant language
12+ MIN_LANGUAGE_RATIO = 0.1
13+
814
915def determine_project_language (folder_path : str , strategy : Literal ["most_common" , "git_most_common" , "package_json" ] = "git_most_common" ) -> ProgrammingLanguage :
1016 """Determines the primary programming language of a project.
@@ -37,7 +43,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
3743 folder_path (str): Path to the folder to analyze
3844
3945 Returns:
40- ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
46+ ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
47+ or if less than MIN_LANGUAGE_RATIO of files match the dominant language
4148 """
4249 from codegen .sdk .python import PyFile
4350 from codegen .sdk .typescript .file import TSFile
@@ -54,6 +61,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
5461
5562 # Initialize counters for each language
5663 language_counts = Counter ()
64+ total_files = 0
5765
5866 # Walk through the directory
5967 for file_path in folder .rglob ("*" ):
@@ -65,17 +73,27 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
6573 if any (ignore in str (file_path ) for ignore in [".git" , "node_modules" , "__pycache__" , "venv" , ".env" ]):
6674 continue
6775
76+ total_files += 1
77+
6878 # Count files for each language based on extensions
6979 for language , exts in EXTENSIONS .items ():
7080 if file_path .suffix in exts :
7181 language_counts [language ] += 1
7282
7383 # If no file matched a known language extension, fall back to the default language value
7484 if not language_counts :
75- return ProgrammingLanguage .UNSUPPORTED
85+ return ProgrammingLanguage .OTHER
86+
87+ # Get the most common language and its count
88+ most_common_language , count = language_counts .most_common (1 )[0 ]
89+
90+ logger .debug (f"Most common language: { most_common_language } , count: { count } , total files: { total_files } " )
7691
77- # Return the language with the highest count
78- return language_counts .most_common (1 )[0 ][0 ]
92+ # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
93+ if total_files > 0 and (count / total_files ) < MIN_LANGUAGE_RATIO :
94+ return ProgrammingLanguage .OTHER
95+
96+ return most_common_language
7997
8098
8199def _determine_language_by_git_file_count (folder_path : str ) -> ProgrammingLanguage :
@@ -86,7 +104,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
86104 folder_path (str): Path to the git repo to analyze
87105
88106 Returns:
89- ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
107+ ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
108+ or if less than MIN_LANGUAGE_RATIO of files match the dominant language
90109 """
91110 from codegen .git .repo_operator .repo_operator import RepoOperator
92111 from codegen .git .schemas .repo_config import RepoConfig
@@ -105,6 +124,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
105124
106125 # Initialize counters for each language
107126 language_counts = Counter ()
127+ total_files = 0
108128
109129 # Initiate RepoOperator
110130 git_root , base_path = split_git_path (folder_path )
@@ -120,17 +140,27 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
120140 if file_path .is_dir () or file_path .name .startswith ("." ):
121141 continue
122142
143+ total_files += 1
144+
123145 # Count files for each language based on extensions
124146 for language , exts in EXTENSIONS .items ():
125147 if file_path .suffix in exts :
126148 language_counts [language ] += 1
127149
128150 # If no file matched a known language extension, fall back to the default language value
129151 if not language_counts :
130- return ProgrammingLanguage .UNSUPPORTED
152+ return ProgrammingLanguage .OTHER
153+
154+ # Get the most common language and its count
155+ most_common_language , count = language_counts .most_common (1 )[0 ]
156+
157+ logger .debug (f"Most common language: { most_common_language } , count: { count } , total files: { total_files } " )
158+
159+ # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
160+ if total_files > 0 and (count / total_files ) < MIN_LANGUAGE_RATIO :
161+ return ProgrammingLanguage .OTHER
131162
132- # Return the language with the highest count
133- return language_counts .most_common (1 )[0 ][0 ]
163+ return most_common_language
134164
135165
136166def _determine_language_by_package_json (folder_path : str ) -> ProgrammingLanguage :
@@ -145,6 +175,8 @@ def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage
145175 """
146176 package_json_path = Path (folder_path ) / "package.json"
147177 if package_json_path .exists ():
178+ logger .debug (f"Found package.json at { package_json_path } " )
148179 return ProgrammingLanguage .TYPESCRIPT
149180 else :
181+ logger .debug (f"No package.json found at { package_json_path } " )
150182 return ProgrammingLanguage .PYTHON
0 commit comments