LICENSE
README.md
pyproject.toml
data_prep/arxiv/arxiv_cleaner.py
data_prep/arxiv/run_clean.py
data_prep/arxiv/run_download.py
data_prep/arxiv/token_count.py
data_prep/arxiv/utils.py
data_prep/book/__init__.py
data_prep/book/dedup.py
data_prep/book/download.py
data_prep/book/token_count.py
data_prep/c4/__init__.py
data_prep/c4/c4_reformat.py
data_prep/cc/cc_net/setup.py
data_prep/cc/cc_net/cc_net/__init__.py
data_prep/cc/cc_net/cc_net/__main__.py
data_prep/cc/cc_net/cc_net/dedup.py
data_prep/cc/cc_net/cc_net/execution.py
data_prep/cc/cc_net/cc_net/flat_hash_set.py
data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
data_prep/cc/cc_net/cc_net/jsonql.py
data_prep/cc/cc_net/cc_net/mine.py
data_prep/cc/cc_net/cc_net/minify.py
data_prep/cc/cc_net/cc_net/perplexity.py
data_prep/cc/cc_net/cc_net/process_wet_file.py
data_prep/cc/cc_net/cc_net/regroup.py
data_prep/cc/cc_net/cc_net/split_by_lang.py
data_prep/cc/cc_net/cc_net/text_normalizer.py
data_prep/cc/cc_net/cc_net/tokenizer.py
data_prep/cc/cc_net/cc_net/tools/__init__.py
data_prep/cc/cc_net/cc_net/tools/dl_cc_100.py
data_prep/cc/cc_net/cc_net/tools/expand_corpus.py
data_prep/cc/cc_net/cc_net/tools/make_dmoz_corpus.py
data_prep/cc/cc_net/tests/__init__.py
data_prep/cc/cc_net/tests/conftest.py
data_prep/cc/cc_net/tests/test_dedup.py
data_prep/cc/cc_net/tests/test_flat_hash_set.py
data_prep/cc/cc_net/tests/test_jsonql.py
data_prep/cc/cc_net/tests/test_minify.py
data_prep/cc/cc_net/tests/test_normalizer.py
data_prep/cc/cc_net/tests/test_parse_wet_file.py
data_prep/cc/cc_net/tests/test_regroup.py
data_prep/cc/cc_net/tests/test_transformer.py
data_prep/cc/classifier/classify.py
data_prep/cc/classifier/create_corpus.py
data_prep/cc/dedup/dedup_phase1.py
data_prep/cc/dedup/dedup_phase2.py
data_prep/github/__init__.py
data_prep/github/github_clean_dedup_local.py
data_prep/github/github_global_dedup.py
data_prep/github/github_merge_dedup.py
data_prep/github/github_run_filter.py
data_prep/github/github_token_count.py
data_prep/redpajama_data.egg-info/PKG-INFO
data_prep/redpajama_data.egg-info/SOURCES.txt
data_prep/redpajama_data.egg-info/dependency_links.txt
data_prep/redpajama_data.egg-info/top_level.txt
data_prep/stack_exchange/__init__.py
data_prep/stack_exchange/count.py
data_prep/stack_exchange/download.py
data_prep/stack_exchange/filter.py
data_prep/stack_exchange/post_processing.py
data_prep/stack_exchange/print_stats.py
data_prep/stack_exchange/token_count.py
data_prep/wikipedia/__init__.py
data_prep/wikipedia/convert_format.py
data_prep/wikipedia/download.py
data_prep/wikipedia/token_count.py