You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
99 lines
3.3 KiB
99 lines
3.3 KiB
# Every target in this file names a command, not a file it produces, so all of
# them are declared phony; otherwise a file or directory with the same name
# (e.g. "data" or "test") would make the target look up to date.
.PHONY: test install style quality fix clean \
	tensorboard list-runs update-worklog \
	data download-musique prepare-musique-jsonl extract-musique-paragraphs \
	build-musique-index prepare-musique-index prepare-all-musique check-data
|
|
|
|
# Paths handed to ruff, isort and flake8 by the style, quality and fix targets.
check_dirs := . src notebooks scripts tests

# Export PYTHONPATH=src to every recipe so that scripts and tests import the
# local checkout rather than a pre-installed copy of the package.
export PYTHONPATH := src
|
|
|
|
# Run the pytest suite located in tests/ with verbose output.
test:
	pytest -v tests/
|
|
|
|
# Development dependencies
# Editable-install this project, then (only if that succeeded) the FlashRAG
# checkout under third_party/.
install:
	pip install -e . && pip install -e third_party/FlashRAG
|
|
|
|
# Code quality and style
# style: rewrite files in place -- ruff formatter (119 columns, py311 target)
# followed by isort, both over $(check_dirs).
style:
	ruff format --line-length 119 --target-version py311 $(check_dirs)
	isort $(check_dirs)
|
|
|
|
# quality: lint $(check_dirs) with ruff check, isort in --check-only mode and
# flake8, all using a 119-column limit. Make stops at the first tool that
# exits non-zero.
quality:
	ruff check --line-length 119 --target-version py311 $(check_dirs)
	isort --check-only $(check_dirs)
	flake8 --max-line-length 119 $(check_dirs)
|
|
|
|
# Auto-fix issues
# Apply ruff's automatic fixes (--fix) and then re-sort imports with isort,
# both over $(check_dirs).
fix:
	ruff check --fix --line-length 119 --target-version py311 $(check_dirs)
	isort $(check_dirs)
|
|
|
|
# TensorBoard
# Start TensorBoard on port 6006 with the trainer_output_*_runs pattern as logdir.
# NOTE(review): the "*" sits inside the --logdir=... word, so the shell hands
# the pattern to tensorboard unexpanded unless a path matching the whole word
# exists -- confirm tensorboard resolves the pattern itself.
tensorboard:
	tensorboard --logdir=trainer_output_*_runs --port=6006
|
|
|
|
# List available run directories
# Print every directory matching trainer_output_*_runs in the current
# directory; ls errors are silenced and a fallback message is shown when ls
# fails (i.e. nothing matches).
list-runs:
	@echo "Available run directories:"
	@ls -d trainer_output_*_runs 2>/dev/null || echo "No run directories found"
|
|
|
|
# Data Preparation
# Umbrella target: runs prepare-musique-jsonl (and its prerequisites), then
# prints a completion message.
data: prepare-musique-jsonl
	@echo "Data preparation complete."
|
|
|
|
# Index Preparation
# Umbrella target: runs build-musique-index (and its prerequisites), then
# prints a completion message.
prepare-musique-index: build-musique-index
	@echo "Musique index preparation complete."
|
|
|
|
# Run the Musique download script. The trailing message names ./data/raw/ as
# the place where the dataset is expected to end up.
download-musique:
	@echo "Downloading Musique dataset..."
	bash scripts/train_data/download_data_musique.sh
	@echo "Musique dataset ready in ./data/raw/"
|
|
|
|
# After download-musique has run, convert the data with
# prepare_musique_jsonl.py. The trailing message names
# ./data/processed/questions.jsonl as the expected output.
prepare-musique-jsonl: download-musique
	@echo "Preparing Musique data (JSONL)..."
	python scripts/train_data/prepare_musique_jsonl.py
	@echo "Processed Musique JSONL ready in ./data/processed/questions.jsonl"
|
|
|
|
# After download-musique has run, extract the paragraphs with
# extract_musique_paragraphs.py. The trailing message names
# ./data/processed/paragraphs.csv as the expected output.
extract-musique-paragraphs: download-musique
	@echo "Extracting unique paragraphs from raw Musique data..."
	python scripts/train_data/extract_musique_paragraphs.py
	@echo "Musique paragraphs extracted to ./data/processed/paragraphs.csv"
|
|
|
|
# After extract-musique-paragraphs has run, build the FAISS index with
# build_musique_index.py. The trailing message names ./data/processed/ as the
# place where the index files are expected to be saved.
build-musique-index: extract-musique-paragraphs
	@echo "Building Musique FAISS index from paragraphs..."
	python scripts/train_data/build_musique_index.py
	@echo "Musique FAISS index files saved to ./data/processed/"
|
|
|
|
# Combined Preparation
# Run both the data target and the prepare-musique-index target, then print a
# completion message.
prepare-all-musique: data prepare-musique-index
	@echo "All Musique data and index preparation complete."
|
|
|
|
# Check Data
# Run the full preparation chain (prepare-all-musique) and then
# scripts/check_data.py against the result.
check-data: prepare-all-musique
	@echo "Checking generated data files..."
	python scripts/check_data.py
|
|
|
|
# Clean up
# Remove Python bytecode files, coverage data, tool caches, packaging output
# and the raw/processed data directories.
#
# -prune keeps find from descending into a directory that is about to be
# removed, so a nested match (e.g. build/lib/build) is never passed to rm
# after its parent has already been deleted -- without it rm exits non-zero
# and the whole target fails. rm -rf also tolerates paths that are already gone.
#
# Removing ./data/processed also takes the legacy ./data/processed/faiss_index
# directory with it, so no separate step is needed for that.
#
# NOTE(review): the search starts at "." with no exclusions, so directories
# named build/ or dist/ anywhere below it (e.g. under third_party/) are
# removed as well -- confirm that is intended.
clean:
	find . -type f \( -name "*.pyc" -o -name "*.pyo" -o -name "*.pyd" -o -name ".coverage" \) -delete
	find . -type d \( \
		-name "__pycache__" -o -name "*.egg-info" -o -name "*.egg" \
		-o -name ".pytest_cache" -o -name ".ruff_cache" -o -name ".coverage" \
		-o -name "htmlcov" -o -name "build" -o -name "dist" \
	\) -prune -exec rm -rf {} +
	rm -rf ./data/raw ./data/processed
|
|
|
|
# Update worklog in GitHub issue
# PATCH comment 2743047160 in menloresearch/DeepSearch via `gh api`, sending
# the contents of docs/00_worklog.md as the comment body. The response is piped
# through cat.
# NOTE(review): because of the pipe, the "&&" tests cat's exit status rather
# than gh's (unless pipefail is enabled via SHELL/.SHELLFLAGS elsewhere), so
# the kill runs even when the API call fails.
# NOTE(review): the final command sends SIGKILL to the recipe shell's parent
# process -- presumably the make process itself -- so the invocation ends as
# "killed" instead of exiting 0. Confirm this is intentional.
update-worklog:
	gh api -X PATCH /repos/menloresearch/DeepSearch/issues/comments/2743047160 \
		-f body="$$(cat docs/00_worklog.md)" | cat && kill -9 $$PPID
|