Skip to content

Commit 54c2baf

Browse files
Added docstrings to the code and made JuliaHealthLLM a package
1 parent 0ef8e86 commit 54c2baf

File tree

8 files changed

+295
-110
lines changed

8 files changed

+295
-110
lines changed

Manifest.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
julia_version = "1.10.5"
44
manifest_format = "2.0"
5-
project_hash = "7466aa371ae0c2e72224798b8708eaac8f0c47a6"
5+
project_hash = "714c8f58ac5b36d35301b506df3812f7dc600a22"
66

77
[[deps.AliasTables]]
88
deps = ["PtrArrays", "Random"]

Project.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
66
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
77
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
88
DrWatson = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
9-
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
10-
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
11-
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
129
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
1310
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
1411

scripts/clone.jl

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,48 @@
1-
using LibGit2
2-
using Downloads
"""
    clone_repositories(repos::Vector{String}, data_dir::String)

Clone each Git repository in `repos` into its own subdirectory of `data_dir`.

# Arguments
- `repos::Vector{String}`: URLs of the repositories to clone.
- `data_dir::String`: directory that receives one subdirectory per repository.

# Behavior
- `data_dir` is created, together with any missing parent directories, when it
  does not exist yet.
- The subdirectory name is the last `/`-separated component of the URL (trailing
  slashes ignored) with a trailing `.git` removed.
- A repository whose subdirectory already exists is skipped.
- A failed clone is reported on standard output; the remaining repositories are
  still processed.

# Returns
- `nothing`.

# Example
```julia
repos = [
    "https://github.com/JuliaLang/julia.git",
    "https://github.com/JuliaHealth/MedEval3D.jl.git"
]
data_dir = "data/exp_raw"
clone_repositories(repos, data_dir)
```
"""
function clone_repositories(repos::Vector{String}, data_dir::String)
    # `mkpath` creates missing parents and does nothing when the directory is
    # already there; `mkdir` would throw for a nested path such as
    # "data/exp_raw" when "data" is missing.
    mkpath(data_dir)

    for repo in repos
        # Ignore trailing slashes so ".../owner/repo/" still yields "repo"
        # instead of an empty name that resolves to `data_dir` itself.
        repo_name = split(rstrip(repo, '/'), "/")[end]

        # Strip only a trailing ".git"; removing every occurrence would turn
        # "site.github.io.git" into "sitehub.io".
        if endswith(repo_name, ".git")
            repo_name = chop(repo_name; tail=4)
        end
        repo_path = joinpath(data_dir, repo_name)

        if isdir(repo_path)
            println("Skipping $repo, already cloned.")
        else
            try
                LibGit2.clone(repo, repo_path)
                println("Successfully cloned $repo")
            catch e
                # Best effort: report the failure and carry on with the rest.
                println("Failed to clone $repo: $e")
            end
        end
    end
    return nothing
end

scripts/knowledge.jl

Lines changed: 172 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,29 @@
1-
using LibGit2
2-
using TextAnalysis
3-
using Serialization
1+
"""
2+
normalize_repo_url(input::String) -> String
43
4+
Normalizes a repository URL or name into a full GitHub URL.
5+
6+
# Arguments
7+
- `input::String`: The repository name (e.g., "owner/repo") or a full Git URL.
8+
9+
# Returns
10+
- A normalized GitHub URL ending with `.git`.
11+
12+
# Throws
13+
- An error if the input is not a valid repository name or URL.
14+
15+
# Behavior
16+
- If the input is already a valid Git URL (e.g., "https://github.com/owner/repo.git"), it is returned as-is.
17+
- If the input is a repository name in the format "owner/repo", it is converted to a full GitHub URL.
18+
- If the input is invalid, an error is raised.
19+
20+
# Example
21+
```julia
22+
normalize_repo_url("JuliaLang/julia") # "https://github.com/JuliaLang/julia.git"
23+
normalize_repo_url("https://github.com/JuliaLang/julia") # "https://github.com/JuliaLang/julia.git"
24+
normalize_repo_url("invalid_repo") # Error: Invalid repository URL or name
25+
```
26+
"""
527
function normalize_repo_url(input::String)
628
if occursin(r"^https?://", input)
729
return endswith(input, ".git") ? input : input * ".git"
@@ -12,18 +34,119 @@ function normalize_repo_url(input::String)
1234
end
1335
end
1436

"""
    clone_repo(repo_url::String, local_path::String)

Clone the Git repository at `repo_url` into the directory `local_path`.

One message is printed before the clone starts and another after it finishes.
Nothing is caught here, so any exception raised by `LibGit2.clone` propagates to
the caller and the success message is not printed.

# Arguments
- `repo_url::String`: URL of the repository to clone.
- `local_path::String`: destination directory for the clone.

# Returns
- `nothing`.

# Example
```julia
clone_repo("https://github.com/JuliaLang/julia.git", "local_julia_repo")
```
"""
function clone_repo(repo_url::String, local_path::String)
    start_message = "Cloning repository from $repo_url to $local_path..."
    println(start_message)

    LibGit2.clone(repo_url, local_path)

    println("Repository cloned successfully.")
    return nothing
end
2063

"""
    is_textual_file(filename::AbstractString) -> Bool

Return `true` when `filename` ends in one of the extensions treated as text.

# Arguments
- `filename::AbstractString`: file name or path to test. Any string type is
  accepted, including a `SubString` produced by `split`.

# Returns
- `true` if the lower-cased name ends in `.md`, `.jl`, `.txt`, `.rst`, `.doc`,
  `.docx`, `.py`, `.cpp`, `.c`, `.h`, `.java` or `.sh`; `false` otherwise.

# Behavior
- Only the name is inspected; the file is never opened.
- The comparison is case-insensitive because the name is lower-cased first.

# Example
```julia
is_textual_file("README.md")  # true
is_textual_file("script.jl")  # true
is_textual_file("image.png")  # false
```
"""
function is_textual_file(filename::AbstractString)
    # NOTE(review): `.doc` / `.docx` are on the list although those formats are
    # usually binary containers — confirm that is intended.
    textual_extensions = r"\.(md|jl|txt|rst|doc|docx|py|cpp|c|h|java|sh)$"
    return occursin(textual_extensions, lowercase(filename))
end
2590

26-
function build_corpus_from_repo(repo_input::String)
"""
    process_directory(dir_path::String, documents::Vector{AbstractDocument})

Walk `dir_path` recursively and append one `StringDocument` to `documents` for
every file that `is_textual_file` accepts.

# Arguments
- `dir_path::String`: directory to scan.
- `documents::Vector{AbstractDocument}`: collection that is extended in place.

# Behavior
- Entries are visited in the order returned by `readdir`.
- A file accepted by `is_textual_file` is read in full as a `String`, wrapped in
  a `StringDocument` and pushed onto `documents`.
- A directory is descended into unless its base name starts with a dot.
- Every other entry is ignored.
- A progress line is printed for each processed file and each explored directory.

# Example
```julia
documents = AbstractDocument[]
process_directory("path/to/repo", documents)
println("Number of documents processed: ", length(documents))
```
"""
function process_directory(dir_path::String, documents::Vector{AbstractDocument})
    for entry in readdir(dir_path, join=true)
        # Textual file: load it and move on to the next entry.
        if isfile(entry) && is_textual_file(entry)
            println("Processing file: $entry")
            push!(documents, StringDocument(read(entry, String)))
            continue
        end

        # Everything below only concerns directories.
        isdir(entry) || continue

        # Hidden directories (base name starting with a dot) are not explored.
        occursin(r"^\.", basename(entry)) && continue

        println("Exploring directory: $entry...")
        process_directory(entry, documents)
    end
    return nothing
end
125+
126+
"""
127+
build_corpus_from_repo(repo_input::String) -> Corpus
128+
129+
Clones a repository (if not already cloned), processes its files, and builds a corpus.
130+
131+
# Arguments
132+
- `repo_input::String`: The repository name (e.g., "owner/repo") or a full Git URL.
133+
134+
# Returns
135+
- A `Corpus` object containing the textual content of the repository.
136+
137+
# Behavior
138+
- The repository URL is normalized using `normalize_repo_url`.
139+
- If the repository is already cloned, it reuses the local copy; otherwise, it clones the repository using `clone_repo`.
140+
- The repository's directory is processed recursively using `process_directory` to extract textual files.
141+
- A `Corpus` object is created from the extracted documents, and its lexicon and index are updated.
142+
143+
# Example
144+
```julia
145+
corpus = build_corpus_from_repo("JuliaLang/julia")
146+
println("Corpus contains ", length(corpus), " documents.")
147+
```
148+
"""
149+
function build_corpus_from_repo(repo_input::String)::Corpus
27150
repo_url = normalize_repo_url(repo_input)
28151
repo_name = split(repo_url, "/")[end] |> x -> replace(x, ".git" => "")
29152
local_path = joinpath(pwd(), "repo_$repo_name")
@@ -35,22 +158,7 @@ function build_corpus_from_repo(repo_input::String)
35158
end
36159

37160
documents = AbstractDocument[]
38-
39-
function process_directory(dir_path)
40-
for entry in readdir(dir_path, join=true)
41-
if isfile(entry) && is_textual_file(entry)
42-
println("Processing file: $entry")
43-
content = read(entry, String)
44-
doc = StringDocument(content)
45-
push!(documents, doc)
46-
elseif isdir(entry) && !occursin(r"^\.", basename(entry))
47-
println("Exploring directory: $entry...")
48-
process_directory(entry)
49-
end
50-
end
51-
end
52-
53-
process_directory(local_path)
161+
process_directory(local_path, documents) # Call the standalone function
54162

55163
corpus = Corpus(documents)
56164
println("Updating corpus lexicon and index...")
@@ -60,50 +168,54 @@ function build_corpus_from_repo(repo_input::String)
60168
return corpus
61169
end
62170

"""
    save_corpus(corpus::Corpus, repo_input::String)

Serialize `corpus` to `corpus_<name>.jls` in the current working directory.

# Arguments
- `corpus::Corpus`: the corpus to write.
- `repo_input::String`: repository name or URL from which the file name is derived.

# Behavior
- `<name>` is the last `/`-separated segment of `normalize_repo_url(repo_input)`
  with every occurrence of `.git` removed and every character outside
  `a-z`, `A-Z`, `0-9`, `_` replaced by `_`.
- The directory of the target file is created when it does not exist.
- Progress messages are printed before and after the write.
- Any exception raised while creating the directory or writing the file is
  caught and reported on standard output; it is not rethrown.
- Exceptions from `pwd` or `normalize_repo_url` are raised before the guarded
  section and therefore propagate to the caller.

# Returns
- `nothing`, whether or not the write succeeded.

# Example
```julia
corpus = build_corpus_from_repo("JuliaLang/julia")
save_corpus(corpus, "JuliaLang/julia")
```
"""
function save_corpus(corpus::Corpus, repo_input::String)
    current_dir = pwd()

    # Derive a file-system-safe name step by step.
    last_segment = last(split(normalize_repo_url(repo_input), "/"))
    without_git = replace(last_segment, ".git" => "")
    safe_repo_name = replace(without_git, r"[^a-zA-Z0-9_]" => "_")
    filename = joinpath(current_dir, "corpus_$(safe_repo_name).jls")

    try
        println("Saving corpus to $filename...")

        # Make sure the directory that will hold the file is present.
        dir_path = dirname(filename)
        if !isdir(dir_path)
            println("Directory $dir_path does not exist. Creating it...")
            mkdir(dir_path)
        end

        open(io -> serialize(io, corpus), filename, "w")

        println("Corpus saved successfully to $filename.")
    catch e
        println("Failed to save corpus: $e")
    end
    return nothing
end

0 commit comments

Comments
 (0)