
Commit 82835a3

the stack v2 is coming

1 parent c81387f

File tree

7 files changed: +1092 -27 lines

poetry.lock

Lines changed: 273 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ seaborn = "^0.12.2"
 ruff = "^0.0.265"
 insegel = "^1.3.1"
 sphinx-autobuild = "^2021.3.14"
+google-cloud-storage = "^2.10.0"
 
 [build-system]
 requires = ["poetry-core"]

text_dedup/bigcode/ablation_visualize.ipynb

Lines changed: 106 additions & 0 deletions
Large diffs are not rendered by default.

text_dedup/bigcode/dirs.list

Lines changed: 658 additions & 0 deletions
Large diffs are not rendered by default.
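Although the new dirs.list is not rendered, run.sh below now reads it with DIRS=$(cat dirs.list), and the commented-out example in that script ("gs://the_stack_v2/licensed_files/language_id=Python") suggests it holds one GCS directory per line, presumably produced by get_list.py (next file).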

text_dedup/bigcode/get_list.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+import sys
+
+from google.cloud import storage
+
+bucket = sys.argv[1]
+prefix = sys.argv[2]
+client = storage.Client()
+b = client.get_bucket(bucket)
+
+blobs = b.list_blobs(prefix=prefix)
+seen = set()
+for blob in blobs:
+    prefix = blob.name.rsplit("/", 1)[0]
+    dir = f"gs://{b.name}/{prefix}"
+    if dir not in seen:
+        print(dir)
+        seen.add(dir)
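Judging from run.sh later in this commit, which now reads DIRS=$(cat dirs.list), this script enumerates the unique gs:// directories under a prefix so they can be cached in dirs.list. The invocation below is an assumption on my part; the bucket and prefix mirror the CONTAINER and DIRECTORY values set in run.sh.

```bash
# Hypothetical usage: print each directory under the prefix exactly once and
# save the list for run.sh to iterate over. Requires GCP credentials with read
# access to the bucket (google-cloud-storage is added to pyproject.toml above).
python get_list.py the_stack_v2 licensed_files > dirs.list
```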

text_dedup/bigcode/intra_dedup.py

Lines changed: 21 additions & 13 deletions
@@ -343,14 +343,19 @@ def area(s):
 # region: Quality Control
 def process_cluster(cluster: List[Any], enabled: bool = False) -> List[Any]:
     if not enabled:
-        np.random.shuffle(cluster)
+        RNG.shuffle(cluster)
         return cluster[:1]
 
     cluster.sort(
         key=lambda x: (
-            -x[-1] if x[-1] is not None else 0.0,  # star_events_count
-            -x[-2] if x[-2] is not None else 0.0,  # fork_events_count
-            -np.datetime64(x[-3]).astype(np.uint64) if x[-3] is not None else 0.0,  # visit_date
+            # license_type, the more permissive the better
+            ["permissive", "no_license", "non_permissive"].index(x[-1]) if x[-1] is not None else float("inf"),
+            # star_events_count, the more the better
+            -x[-2] if x[-2] is not None else 0.0,
+            # fork_events_count, the more the better
+            -x[-3] if x[-3] is not None else 0.0,
+            # visit_date, the earliest the better, tie breaker
+            np.datetime64(x[-4]).astype(np.uint64) if x[-4] is not None else float("inf"),
         )
     )
     return cluster[:1]
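To make the new ranking concrete, here is a small illustrative sketch (mine, not part of the commit). The record layout assumed below follows the column order selected later in this diff, so x[-1] is license_type, x[-2] star_events_count, x[-3] fork_events_count, and x[-4] visit_date.

```python
import numpy as np

# Toy cluster of near-duplicate documents; each tuple is
# (doc_id, visit_date, fork_events_count, star_events_count, license_type).
cluster = [
    ("doc_a", "2021-03-01", 4, 120, "non_permissive"),
    ("doc_b", "2022-07-15", 9, 80, "permissive"),
    ("doc_c", "2020-01-20", 9, 80, "permissive"),
]

def rank_key(x):
    return (
        # permissive < no_license < non_permissive, i.e. more permissive sorts first
        ["permissive", "no_license", "non_permissive"].index(x[-1]) if x[-1] is not None else float("inf"),
        # then more stars, then more forks
        -x[-2] if x[-2] is not None else 0.0,
        -x[-3] if x[-3] is not None else 0.0,
        # earliest visit date breaks remaining ties
        np.datetime64(x[-4]).astype(np.uint64) if x[-4] is not None else float("inf"),
    )

cluster.sort(key=rank_key)
print(cluster[:1])  # keeps doc_c: permissive, same stars/forks as doc_b, earlier visit_date
```

Under the old key only stars, forks, and the (negated) visit date were compared; the license type did not enter the choice of representative.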
@@ -382,7 +387,7 @@ def partitioned_save(df: DataFrame, chunk_size: int, max_partitions: int, output
     """
 
     total_rows = df.count()
-    partitions = max(1, min(math.ceil(total_rows / chunk_size), max_partitions))
+    partitions = max(256, min(math.ceil(total_rows / chunk_size), max_partitions))
 
     def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
         pid = df["__pid__"].iloc[0]
@@ -392,15 +397,17 @@ def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
         )
         return pd.DataFrame([{"__status__": True, "__pid__": pid}])
 
+    log.debug(f"Saving {total_rows} rows to {partitions} partitions.")
+
     results = (
         df.repartition(partitions)  # random and uniform hash partitioning
         .withColumn("__pid__", F.spark_partition_id())
         .groupBy("__pid__")
         .applyInPandas(save_partition, schema="__status__ boolean, __pid__ int")
-        .toPandas()
+        .cache()
    )
 
-    if results["__status__"].all():
+    if results.filter(~F.col("__status__")).count() == 0:
         pd.DataFrame([]).to_csv(os.path.join(output, "_SUCCESS"), index=False, header=False)
         return
 
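A consequence of this hunk worth spelling out: .toPandas() previously collected every per-partition status row onto the driver, while the new version keeps results as a cached Spark DataFrame and counts failed partitions with a distributed filter; the _SUCCESS marker is still written only when no partition reported a failure.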
@@ -433,6 +440,8 @@ def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
     conf = SparkConf()
     conf.set("spark.app.name", "MinHashLSH")
     conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
+    conf.set("spark.storage.memoryFraction", "1")
+    conf.set("spark.default.parallelism", "100")
     spark = SparkSession.builder.config(conf=conf).getOrCreate()  # type: ignore
     log: Logger = spark.sparkContext._jvm.org.apache.log4j.LogManager.getLogger(__name__)  # type: ignore
 
@@ -445,8 +454,8 @@ def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
     if B is None or R is None:
         B, R = optimal_param(args.threshold, args.num_perm)
 
-    MAX_WRITE_CHUNK_SIZE: int = 1_000_000
-    MAX_WRITE_PARTITIONS: int = 256
+    MAX_WRITE_CHUNK_SIZE: int = 80_000
+    MAX_WRITE_PARTITIONS: int = 2048
     HASH_RANGES: List[Tuple[int, int]] = [(i * R, (i + 1) * R) for i in range(B)]
     PERMUTATIONS: Tuple[np.ndarray, np.ndarray] = (
         RNG.randint(1, MOD_PRIME, size=(args.num_perm,), dtype=DTYPE),
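A quick worked example (mine, not from the commit) of how the new write constants interact with the partitions formula in partitioned_save, partitions = max(256, min(math.ceil(total_rows / chunk_size), max_partitions)):

```python
import math

chunk_size, max_partitions = 80_000, 2048
for total_rows in (10_000_000, 500_000_000):
    partitions = max(256, min(math.ceil(total_rows / chunk_size), max_partitions))
    print(f"{total_rows:,} rows -> {partitions} partitions")
# 10,000,000 rows -> 256 partitions   (125 chunks, lifted to the new floor of 256)
# 500,000,000 rows -> 2048 partitions (6,250 chunks, capped by MAX_WRITE_PARTITIONS)
```

Compared to the old values (1,000,000-row chunks, at most 256 partitions), the new settings produce many more, much smaller output files.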
@@ -455,6 +464,7 @@ def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
 
     # region: Data Loading
     df: DataFrame = spark.read.option("mergeSchema", "true").parquet(args.input)
+    # df = df.filter(F.col("license_type") == "permissive").cache()
     if args.index_column is None:
         df = df.withColumn("__id__", F.monotonically_increasing_id()).cache()
     else:
@@ -600,11 +610,9 @@ def save_partition(df: pd.DataFrame) -> pd.DataFrame:  # type: ignore
             "__component__",
             args.repo_column,
             "visit_date",
+            "fork_events_count",
             "star_events_count",
-            "fork_events_count"
-            # "max_stars_repo_stars_event_min_datetime",
-            # "max_stars_count",
-            # "max_forks_count",
+            "license_type",
         ]
         if args.rank
         else [

text_dedup/bigcode/run.sh

Lines changed: 16 additions & 13 deletions
@@ -6,9 +6,9 @@
 CLUSTER_NAME="chenghao-temp"
 PROJECT_ID="huggingface-science-codeparrot"
 REGION="us-central1"
-CONTAINER=""
-DIRECTORY=""
-NUM_WORKERS=18
+CONTAINER="gs://the_stack_v2"
+DIRECTORY="licensed_files"
+NUM_WORKERS=25
 MASTER_MACHINE_TYPE="c2d-standard-16"
 MASTER_BOOT_DISK_SIZE=1024
 WORKER_MACHINE_TYPE="c2-standard-16"
@@ -19,8 +19,9 @@ THRESHOLD=0.7
 REPO_COLUMN="repo_url"
 
 DEDUPED_DIRECTORY="${DIRECTORY}_deduped"
-DEDUPED_INDEX_DIRECTORY="${DEDUPED_DIRECTORY}_index"
-DIRS=$(gsutil ls "${CONTAINER}/${DIRECTORY}")
+# DEDUPED_INDEX_DIRECTORY="${DEDUPED_DIRECTORY}_index"
+# DIRS=("gs://the_stack_v2/licensed_files/language_id=Python")
+DIRS=$(cat dirs.list)
 
 # Create cluster if it doesn't exist
 if ! gcloud dataproc clusters list --region $REGION | grep -q $CLUSTER_NAME; then
@@ -49,12 +50,11 @@ i=0
 
 echo "Total number of directories: $TOTAL"
 for DIR in $DIRS; do
-
     # Progress bar
     echo -n "[ "
     curr_pos=$((i * LENGTH / TOTAL))
-    for ((k = 0 ; k <= curr_pos; k++)); do echo -n "==="; done
-    for ((j = k + 1; j <= LENGTH ; j++)); do echo -n " "; done
+    for ((k = 0; k <= curr_pos; k++)); do echo -n "==="; done
+    for ((j = k + 1; j <= LENGTH; j++)); do echo -n " "; done
     v=$(((i + 1) * 100 / TOTAL))
     echo -n " ] "
     echo "$v %" $'\r'
@@ -64,8 +64,8 @@ for DIR in $DIRS; do
     INPUT_GCS_PATH="${DIR}"
     LAN=$(echo "$DIR" | rev | cut -d'/' -f1 | rev)
     OUTPUT_GCS_PATH="${CONTAINER}/${DEDUPED_DIRECTORY}/${LAN}"
-    OUTPUT_INDEX_GCS_PATH="${CONTAINER}/${DEDUPED_INDEX_DIRECTORY}/${LAN}"
-    OUTPUT_STATUS_GCS_PATH="${CONTAINER}/${DEDUPED_INDEX_DIRECTORY}/${LAN}/_SUCCESS"
+    # OUTPUT_INDEX_GCS_PATH="${CONTAINER}/${DEDUPED_INDEX_DIRECTORY}/${LAN}"
+    OUTPUT_STATUS_GCS_PATH="${OUTPUT_GCS_PATH}/_SUCCESS"
     result=$(gsutil stat "${OUTPUT_STATUS_GCS_PATH}" 2>&1 | grep -c "No URLs matched")
     if [[ $result != 1 ]]; then
         echo "Skipping ${LAN}"
@@ -82,10 +82,13 @@ for DIR in $DIRS; do
         --input "$INPUT_GCS_PATH" \
         --output "$OUTPUT_GCS_PATH" \
         --threshold $THRESHOLD \
-        --output_index "$OUTPUT_INDEX_GCS_PATH" \
         --repo_column $REPO_COLUMN \
-        --rank \
-        --debug
+        --rank
+
+        # --rank
+        # --debug
+        # --output_index "$OUTPUT_INDEX_GCS_PATH"
+
 done
 
 gcloud dataproc clusters stop $CLUSTER_NAME --region $REGION
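Note on the skip logic above: gsutil stat prints "No URLs matched" when the object is missing, so grep -c yields 1 and the directory gets processed; once a _SUCCESS marker exists the count is 0 and that language is skipped. With the index output dropped, the marker is now checked under the deduped output directory itself.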
