Skip to content

Commit 080e665

Browse files
gaudyb and Gaudy Blanco authored
parquet reader implementation as a new input type (#2320)
parquet reader impl Co-authored-by: Gaudy Blanco <gaudy-microsoft@MacBook-Pro-m4-Gaudy-For-Work.local>
1 parent ebc8d61 commit 080e665

File tree

8 files changed

+1968
-1855
lines changed

8 files changed

+1968
-1855
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "implement parquet reader"
4+
}

packages/graphrag-input/graphrag_input/input_reader_factory.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
8181
from graphrag_input.markitdown import MarkItDownFileReader
8282

8383
register_input_reader(InputType.MarkItDown, MarkItDownFileReader)
84+
case InputType.Parquet:
85+
from graphrag_input.parquet import ParquetFileReader
86+
87+
register_input_reader(InputType.Parquet, ParquetFileReader)
8488
case _:
8589
msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
8690
raise ValueError(msg)

packages/graphrag-input/graphrag_input/input_type.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class InputType(StrEnum):
1919
"""The JSON Lines input type."""
2020
MarkItDown = "markitdown"
2121
"""The MarkItDown input type."""
22+
Parquet = "parquet"
23+
"""The Parquet input type."""
2224

2325
def __repr__(self):
2426
"""Get a string representation."""
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright (c) 2024 Microsoft Corporation.
2+
# Licensed under the MIT License
3+
4+
"""A module containing 'ParquetFileReader' model."""
5+
6+
import io
7+
import logging
8+
9+
import pyarrow.parquet as pq
10+
11+
from graphrag_input.structured_file_reader import StructuredFileReader
12+
from graphrag_input.text_document import TextDocument
13+
14+
logger = logging.getLogger(__name__)
15+
16+
17+
class ParquetFileReader(StructuredFileReader):
    """Structured input reader for parquet files."""

    def __init__(self, file_pattern: str | None = None, **kwargs):
        """Create the reader.

        Args:
            file_pattern: Regex used to select input files. When ``None``,
                a pattern matching names ending in ``.parquet`` is used.
            **kwargs: Forwarded unchanged to ``StructuredFileReader``.
        """
        # Only substitute the default when the caller passed no pattern at all.
        if file_pattern is None:
            file_pattern = ".*\\.parquet$"
        super().__init__(file_pattern=file_pattern, **kwargs)

    async def read_file(self, path: str) -> list[TextDocument]:
        """Read a parquet file into a list of documents.

        Args:
            path: The path to read the file from.

        Returns:
            A list with a TextDocument for each row in the file.
        """
        # Fetch the raw bytes from storage and expose them as a file-like
        # object so pyarrow can parse the parquet payload from memory.
        payload = await self._storage.get(path, as_bytes=True)
        buffer = io.BytesIO(payload)
        # One dict per row, keyed by column name.
        records = pq.read_table(buffer).to_pylist()
        return await self.process_data_columns(records, path)

packages/graphrag-input/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ dependencies = [
3434
"graphrag-storage==3.0.8 ",
3535
"pydantic~=2.10",
3636
"markitdown~=0.1.0",
37-
"markitdown[pdf]"
37+
"markitdown[pdf]",
38+
"pyarrow>=14.0.0"
3839
]
3940

4041
[project.urls]
912 Bytes
Binary file not shown.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright (c) 2024 Microsoft Corporation.
2+
# Licensed under the MIT License
3+
4+
from graphrag_input import InputConfig, InputType, create_input_reader
5+
from graphrag_storage import StorageConfig, create_storage
6+
7+
8+
async def test_parquet_loader_one_file():
    """Read the one-parquet fixture with an explicit file pattern.

    With no title column configured, the expected titles are the file name
    followed by the row index in parentheses.
    """
    input_config = InputConfig(
        type=InputType.Parquet,
        file_pattern=".*\\.parquet$",
    )
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-parquet",
    )
    parquet_reader = create_input_reader(input_config, create_storage(storage_config))

    docs = await parquet_reader.read_files()

    assert len(docs) == 2
    first_doc, second_doc = docs[0], docs[1]
    assert first_doc.title == "input.parquet (0)"
    expected_raw = {
        "title": "Hello",
        "text": "Hi how are you today?",
    }
    assert first_doc.raw_data == expected_raw
    assert second_doc.title == "input.parquet (1)"
27+
28+
29+
async def test_parquet_loader_one_file_with_title():
    """Read the one-parquet fixture with ``title_column`` set to ``"title"``.

    Document titles are expected to come from that column's values.
    """
    input_config = InputConfig(
        type=InputType.Parquet,
        title_column="title",
    )
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-parquet",
    )
    parquet_reader = create_input_reader(input_config, create_storage(storage_config))

    docs = await parquet_reader.read_files()

    assert len(docs) == 2
    assert docs[0].title == "Hello"
    assert docs[1].title == "World"
44+
45+
46+
async def test_parquet_loader_text_content():
    """Read the one-parquet fixture with explicit text and title columns.

    Each document's text is expected to match the ``text`` column of its row.
    """
    input_config = InputConfig(
        type=InputType.Parquet,
        text_column="text",
        title_column="title",
    )
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-parquet",
    )
    parquet_reader = create_input_reader(input_config, create_storage(storage_config))

    docs = await parquet_reader.read_files()

    assert len(docs) == 2
    assert docs[0].text == "Hi how are you today?"
    assert docs[1].text == "This is a test."

uv.lock

Lines changed: 1856 additions & 1854 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)