1- using LibGit2
2- using TextAnalysis
3- using Serialization
1+ """
2+ normalize_repo_url(input::String) -> String
43
4+ Normalizes a repository URL or name into a full GitHub URL.
5+
6+ # Arguments
7+ - `input::String`: The repository name (e.g., "owner/repo") or a full Git URL.
8+
9+ # Returns
10+ - A normalized GitHub URL ending with `.git`.
11+
12+ # Throws
13+ - An error if the input is not a valid repository name or URL.
14+
15+ # Behavior
16+ - If the input is an HTTP(S) URL (e.g., "https://github.com/owner/repo"), it is returned with a `.git` suffix appended when one is not already present.
17+ - If the input is a repository name in the format "owner/repo", it is converted to a full GitHub URL.
18+ - If the input is invalid, an error is raised.
19+
20+ # Example
21+ ```julia
22+ normalize_repo_url("JuliaLang/julia") # "https://github.com/JuliaLang/julia.git"
23+ normalize_repo_url("https://github.com/JuliaLang/julia") # "https://github.com/JuliaLang/julia.git"
24+ normalize_repo_url("invalid_repo") # Error: Invalid repository URL or name
25+ ```
26+ """
527function normalize_repo_url (input:: String )
628 if occursin (r" ^https?://" , input)
729 return endswith (input, " .git" ) ? input : input * " .git"
@@ -12,18 +34,119 @@ function normalize_repo_url(input::String)
1234 end
1335end
1436
"""
    clone_repo(repo_url::AbstractString, local_path::AbstractString)

Clone the Git repository at `repo_url` into `local_path` using `LibGit2.clone`.

# Arguments
- `repo_url::AbstractString`: The URL of the repository to clone.
- `local_path::AbstractString`: The local directory the repository is cloned into.

# Returns
- `nothing`.

# Behavior
- Prints one message before the clone starts and one after it has finished.
- The repository handle returned by `LibGit2.clone` is closed before returning, so the
  underlying libgit2 resources are released promptly instead of waiting for a finalizer.

# Throws
- Any exception raised by `LibGit2.clone` is propagated unchanged (for example when the
  URL is invalid, the network is unavailable, or the target path cannot be written).

# Example
```julia
clone_repo("https://github.com/JuliaLang/julia.git", "local_julia_repo")
```
"""
function clone_repo(repo_url::AbstractString, local_path::AbstractString)
    println("Cloning repository from $repo_url to $local_path...")
    # `LibGit2.clone` hands back an open repository object. Callers only need the files
    # on disk, so release the handle right away rather than leaking it until GC.
    repo = LibGit2.clone(repo_url, local_path)
    close(repo)
    println("Repository cloned successfully.")
    return nothing
end
2063
"""
    is_textual_file(filename::AbstractString) -> Bool

Return `true` when `filename` ends with one of the recognised textual extensions.

# Arguments
- `filename::AbstractString`: The file name or path to check. Any string type is
  accepted, including `SubString`s produced by `split`.

# Returns
- `true` if the name ends in `.md`, `.jl`, `.txt`, `.rst`, `.doc`, `.docx`, `.py`,
  `.cpp`, `.c`, `.h`, `.java` or `.sh`; otherwise `false`.

# Behavior
- The name is lowercased before matching, so the check is case-insensitive.
- Only the end of the name is inspected; the file itself is never opened.

# Example
```julia
is_textual_file("README.md")   # true
is_textual_file("SCRIPT.JL")   # true
is_textual_file("image.png")   # false
```
"""
function is_textual_file(filename::AbstractString)
    # The pattern is anchored at the end so that only the final extension counts.
    textual_extensions = r"\.(md|jl|txt|rst|doc|docx|py|cpp|c|h|java|sh)$"
    return occursin(textual_extensions, lowercase(filename))
end
2590
26- function build_corpus_from_repo (repo_input:: String )
91+ """
92+ process_directory(dir_path::String, documents::Vector{AbstractDocument})
93+
94+ Recursively processes a directory to extract textual files and add them to a document collection.
95+
96+ # Arguments
97+ - `dir_path::String`: The directory to process.
98+ - `documents::Vector{AbstractDocument}`: A vector to store the extracted documents.
99+
100+ # Behavior
101+ - Textual files are identified using the `is_textual_file` function.
102+ - The content of each textual file is read and converted into a `StringDocument` object, which is added to the `documents` vector.
103+ - Subdirectories are explored recursively, skipping hidden directories (those starting with a dot).
104+
105+ # Example
106+ ```julia
107+ documents = AbstractDocument[]
108+ process_directory("path/to/repo", documents)
109+ println("Number of documents processed: ", length(documents))
110+ ```
111+ """
112+ function process_directory (dir_path:: String , documents:: Vector{AbstractDocument} )
113+ for entry in readdir (dir_path, join= true )
114+ if isfile (entry) && is_textual_file (entry)
115+ println (" Processing file: $entry " )
116+ content = read (entry, String)
117+ doc = StringDocument (content)
118+ push! (documents, doc)
119+ elseif isdir (entry) && ! occursin (r" ^\. " , basename (entry))
120+ println (" Exploring directory: $entry ..." )
121+ process_directory (entry, documents) # Recursive call
122+ end
123+ end
124+ end
125+
126+ """
127+ build_corpus_from_repo(repo_input::String) -> Corpus
128+
129+ Clones a repository (if not already cloned), processes its files, and builds a corpus.
130+
131+ # Arguments
132+ - `repo_input::String`: The repository name (e.g., "owner/repo") or a full Git URL.
133+
134+ # Returns
135+ - A `Corpus` object containing the textual content of the repository.
136+
137+ # Behavior
138+ - The repository URL is normalized using `normalize_repo_url`.
139+ - If the repository is already cloned, it reuses the local copy; otherwise, it clones the repository using `clone_repo`.
140+ - The repository's directory is processed recursively using `process_directory` to extract textual files.
141+ - A `Corpus` object is created from the extracted documents, and its lexicon and index are updated.
142+
143+ # Example
144+ ```julia
145+ corpus = build_corpus_from_repo("JuliaLang/julia")
146+ println("Corpus contains ", length(corpus), " documents.")
147+ ```
148+ """
149+ function build_corpus_from_repo (repo_input:: String ):: Corpus
27150 repo_url = normalize_repo_url (repo_input)
28151 repo_name = split (repo_url, " /" )[end ] |> x -> replace (x, " .git" => " " )
29152 local_path = joinpath (pwd (), " repo_$repo_name " )
@@ -35,22 +158,7 @@ function build_corpus_from_repo(repo_input::String)
35158 end
36159
37160 documents = AbstractDocument[]
38-
39- function process_directory (dir_path)
40- for entry in readdir (dir_path, join= true )
41- if isfile (entry) && is_textual_file (entry)
42- println (" Processing file: $entry " )
43- content = read (entry, String)
44- doc = StringDocument (content)
45- push! (documents, doc)
46- elseif isdir (entry) && ! occursin (r" ^\. " , basename (entry))
47- println (" Exploring directory: $entry ..." )
48- process_directory (entry)
49- end
50- end
51- end
52-
53- process_directory (local_path)
161+ process_directory (local_path, documents) # Call the standalone function
54162
55163 corpus = Corpus (documents)
56164 println (" Updating corpus lexicon and index..." )
@@ -60,50 +168,54 @@ function build_corpus_from_repo(repo_input::String)
60168 return corpus
61169end
62170
"""
    save_corpus(corpus::Corpus, repo_input::String)

Serialize `corpus` to a `corpus_<name>.jls` file inside the current working directory.

`<name>` is derived from `repo_input`: the input is passed through `normalize_repo_url`,
the last `/`-separated segment is kept, every `.git` occurrence in it is removed, and
each character outside `[a-zA-Z0-9_]` is replaced by `_`.

# Arguments
- `corpus::Corpus`: The corpus to save.
- `repo_input::String`: The repository name or URL used to build the file name.

# Returns
- `nothing`, whether or not the save succeeded.

# Behavior
- Progress messages are printed with `println`.
- The target directory is created with `mkdir` when `isdir` reports that it is missing.
- The corpus is written with `serialize` to a file opened in write mode.
- An exception raised while saving is caught and reported with `println`; it is not
  rethrown.
- Exceptions raised while the file name is being derived (for example by
  `normalize_repo_url`) happen outside the `try` block and are propagated.

# Example
```julia
corpus = build_corpus_from_repo("JuliaLang/julia")
save_corpus(corpus, "JuliaLang/julia")
```
"""
function save_corpus(corpus::Corpus, repo_input::String)
    base_dir = pwd()
    last_segment = split(normalize_repo_url(repo_input), "/")[end]
    # NOTE(review): this removes every ".git" occurrence, not only a trailing suffix,
    # so a name such as "user.github.io" is altered as well -- confirm this is intended.
    without_git = replace(last_segment, ".git" => "")
    safe_repo_name = replace(without_git, r"[^a-zA-Z0-9_]" => "_")
    filename = joinpath(base_dir, "corpus_$(safe_repo_name).jls")

    try
        println("Saving corpus to $filename...")

        # Make sure the directory that will hold the file is present.
        dir_path = dirname(filename)
        if !isdir(dir_path)
            println("Directory $dir_path does not exist. Creating it...")
            mkdir(dir_path)
        end

        # The function form of `open` closes the stream even if `serialize` throws.
        open(io -> serialize(io, corpus), filename, "w")

        println("Corpus saved successfully to $filename.")
    catch e
        println("Failed to save corpus: $e")
    end
    return nothing
end
0 commit comments