Commit

Merge branch 'main' into main
sam-hey authored Jan 26, 2025
2 parents f14fef6 + 18b5630 commit aebdc2e
Showing 16,481 changed files with 921,325 additions and 13,285 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
40 changes: 25 additions & 15 deletions load_external.py
@@ -68,14 +68,17 @@ def get_model_parameters_memory(model_info: ModelInfo) -> tuple[int| None, float
return None, None


def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, float]:
def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, float, str | None]:
siblings = model.siblings or []
filenames = [sib.rfilename for sib in siblings]
dim, seq = None, None
similarity_fn_name = None
for filename in filenames:
if re.match(r"\d+_Pooling/config.json", filename):
st_config_path = hf_hub_download(model.id, filename=filename)
dim = json.load(open(st_config_path)).get("word_embedding_dimension", None)
with open(st_config_path) as f:
pooling_config = json.load(f)
dim = pooling_config.get("word_embedding_dimension", None)
break
for filename in filenames:
if re.match(r"\d+_Dense/config.json", filename):
@@ -87,17 +90,21 @@ def get_dim_seq_size(model: ModelInfo) -> tuple[str | None, str | None, int, flo
if not dim:
dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", None)))
seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", None))))

if "config_sentence_transformers.json" in filenames:
st_config_path = hf_hub_download(model.id, filename="config_sentence_transformers.json")
with open(st_config_path) as f:
st_config = json.load(f)
similarity_fn_name = st_config.get("similarity_fn_name", None)
parameters, memory = get_model_parameters_memory(model)
return dim, seq, parameters, memory
return dim, seq, parameters, memory, similarity_fn_name


def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
readme_path = hf_hub_download(model_info.id, filename="README.md", etag_timeout=30)
meta = metadata_load(readme_path)
dim, seq, parameters, memory = None, None, None, None
dim, seq, parameters, memory, similarity_fn_name = None, None, None, None, None
try:
dim, seq, parameters, memory = get_dim_seq_size(model_info)
dim, seq, parameters, memory, similarity_fn_name = get_dim_seq_size(model_info)
except Exception as e:
logger.error(f"Error getting model parameters for {model_info.id}, {e}")

@@ -110,7 +117,12 @@ def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
for i in range(len(languages)):
if languages[i] is False:
languages[i] = "no"

datasets = meta.get("datasets", None)
if datasets is not None:
datasets = {
d: []
for d in datasets
}
model_meta = ModelMeta(
name=model_info.id,
revision=model_info.sha,
@@ -122,6 +134,11 @@ def create_model_meta(model_info: ModelInfo) -> ModelMeta | None:
max_tokens=seq,
n_parameters=parameters,
languages=languages,
public_training_code=None,
public_training_data=None,
similarity_fn_name=similarity_fn_name,
use_instructions=None,
training_datasets=datasets,
)
return model_meta

@@ -139,14 +156,7 @@ def parse_readme(model_info: ModelInfo) -> dict[str, dict[str, Any]] | None:
return
model_index = meta["model-index"][0]
model_name_from_readme = model_index.get("name", None)
orgs = ["Alibaba-NLP", "HIT-TMG", "McGill-NLP", "Snowflake", "facebook", "jinaai", "nomic-ai"]
is_org = any([model_id.startswith(org) for org in orgs])
# There a lot of reuploads with tunes, quantization, etc. We only want the original model
# to prevent this most of the time we can check if the model name from the readme is the same as the model id
# but some orgs have a different naming in their readme
if model_name_from_readme and not model_info.id.endswith(model_name_from_readme) and not is_org:
logger.warning(f"Model name mismatch: {model_info.id} vs {model_name_from_readme}")
return

results = model_index.get("results", [])
model_results = {}
for result in results:
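Note on the load_external.py changes above: get_dim_seq_size() now additionally returns the model's similarity_fn_name, read from config_sentence_transformers.json when the repository ships that file, and create_model_meta() passes it, together with a training_datasets mapping built from the README's datasets list, into ModelMeta. A minimal, self-contained sketch of the same lookup in isolation; the try/except and the example model id are illustrative assumptions, not part of the commit:

# Illustrative sketch (not part of this commit): fetch similarity_fn_name the
# same way the updated get_dim_seq_size() reads it.
import json

from huggingface_hub import hf_hub_download


def read_similarity_fn_name(repo_id: str) -> str | None:
    """Return the Sentence Transformers similarity function declared by a model, if any."""
    try:
        path = hf_hub_download(repo_id, filename="config_sentence_transformers.json")
    except Exception:
        return None  # the repo does not ship config_sentence_transformers.json
    with open(path) as f:
        st_config = json.load(f)
    # e.g. "cosine" or "dot"; None when the key is absent.
    return st_config.get("similarity_fn_name", None)


# Example call (model id chosen purely for illustration):
# read_similarity_fn_name("sentence-transformers/all-MiniLM-L6-v2")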
39,662 changes: 39,316 additions & 346 deletions paths.json

Large diffs are not rendered by default.

@@ -5,18 +5,19 @@
"languages": [],
"loader": null,
"n_parameters": 135193344,
"memory_usage": null,
"max_tokens": 512,
"max_tokens": 512.0,
"embed_dim": 768,
"license": null,
"open_weights": true,
"public_training_data": null,
"public_training_code": null,
"public_training_data": null,
"framework": [
"Sentence Transformers"
],
"reference": null,
"similarity_fn_name": null,
"use_instructions": null,
"zero_shot_benchmarks": null
"training_datasets": {},
"adapted_from": null,
"superseded_by": null
}
@@ -5,18 +5,19 @@
"languages": [],
"loader": null,
"n_parameters": 135193344,
"memory_usage": null,
"max_tokens": 512,
"max_tokens": 512.0,
"embed_dim": 768,
"license": null,
"open_weights": true,
"public_training_data": null,
"public_training_code": null,
"public_training_data": null,
"framework": [
"Sentence Transformers"
],
"reference": null,
"similarity_fn_name": null,
"use_instructions": null,
"zero_shot_benchmarks": null
"training_datasets": {},
"adapted_from": null,
"superseded_by": null
}
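Both model meta files above reflect the same schema update: memory_usage and zero_shot_benchmarks are dropped, max_tokens is serialised as a float, and training_datasets, adapted_from and superseded_by are new fields. A small sketch of checking an exported meta file against that field set; the file path, helper name and the exact set of expected keys are inferred from the two diffs above, not from a schema definition in the repo:

# Illustrative check (not part of this commit): confirm an exported model meta
# JSON matches the revised field set visible in the two diffs above.
import json

EXPECTED_FIELDS = {
    "languages", "loader", "n_parameters", "max_tokens", "embed_dim",
    "license", "open_weights", "public_training_code", "public_training_data",
    "framework", "reference", "similarity_fn_name", "use_instructions",
    "training_datasets", "adapted_from", "superseded_by",
}
DROPPED_FIELDS = {"memory_usage", "zero_shot_benchmarks"}


def check_model_meta(path: str) -> list[str]:
    """Return a list of schema problems found in the given model meta file."""
    with open(path) as f:
        meta = json.load(f)
    problems = [f"missing field: {key}" for key in sorted(EXPECTED_FIELDS - meta.keys())]
    problems += [f"stale field: {key}" for key in sorted(DROPPED_FIELDS & meta.keys())]
    if meta.get("max_tokens") is not None and not isinstance(meta["max_tokens"], (int, float)):
        problems.append("max_tokens should be numeric (e.g. 512.0)")
    return problems


# Example call (placeholder path): check_model_meta("model_meta.json")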
@@ -0,0 +1,18 @@
{
"dataset_revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"task_name": "AlloProfClusteringP2P",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.6234594305243399,
"main_score": 0.6234594305243399
}
]
}
}
@@ -0,0 +1,18 @@
{
"dataset_revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"task_name": "AlloProfClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.2572945498452115,
"main_score": 0.2572945498452115
}
]
}
}
@@ -0,0 +1,19 @@
{
"dataset_revision": "65393d0d7a08a10b4e348135e824f385d420b0fd",
"task_name": "AlloprofReranking",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map": 0.26596323297349184,
"mrr": 0.26091629657044163,
"main_score": 0.26596323297349184
}
]
}
}
@@ -0,0 +1,52 @@
{
"dataset_revision": "fcf295ea64c750f41fadbaa37b9b861558e1bfbd",
"task_name": "AlloprofRetrieval",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map_at_1": 0.00345,
"map_at_10": 0.00934,
"map_at_100": 0.01191,
"map_at_1000": 0.013419999999999998,
"map_at_20": 0.0102,
"map_at_3": 0.006689999999999999,
"map_at_5": 0.00753,
"mrr_at_1": 0.00345,
"mrr_at_10": 0.00934,
"mrr_at_100": 0.01191,
"mrr_at_1000": 0.013419999999999998,
"mrr_at_20": 0.0102,
"mrr_at_3": 0.006689999999999999,
"mrr_at_5": 0.00753,
"ndcg_at_1": 0.00345,
"ndcg_at_10": 0.013839999999999998,
"ndcg_at_100": 0.03151,
"ndcg_at_1000": 0.09014,
"ndcg_at_20": 0.01692,
"ndcg_at_3": 0.00785,
"ndcg_at_5": 0.00941,
"precision_at_1": 0.00345,
"precision_at_10": 0.00289,
"precision_at_100": 0.00124,
"precision_at_1000": 0.00063,
"precision_at_20": 0.00205,
"precision_at_3": 0.00374,
"precision_at_5": 0.00302,
"recall_at_1": 0.00345,
"recall_at_10": 0.02893,
"recall_at_100": 0.12435,
"recall_at_1000": 0.62867,
"recall_at_20": 0.04102,
"recall_at_3": 0.01123,
"recall_at_5": 0.015110000000000002,
"main_score": 0.013839999999999998
}
]
}
}
@@ -0,0 +1,19 @@
{
"dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
"task_name": "AmazonReviewsClassification",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"accuracy": 0.32661999999999997,
"f1": 0.32443152253731844,
"main_score": 0.32661999999999997
}
]
}
}
@@ -0,0 +1,52 @@
{
"dataset_revision": "5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59",
"task_name": "BSARDRetrieval",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"map_at_1": 0.0,
"map_at_10": 0.0,
"map_at_100": 0.00062,
"map_at_1000": 0.00077,
"map_at_20": 0.0,
"map_at_3": 0.0,
"map_at_5": 0.0,
"mrr_at_1": 0.0,
"mrr_at_10": 0.0,
"mrr_at_100": 0.00062,
"mrr_at_1000": 0.00077,
"mrr_at_20": 0.0,
"mrr_at_3": 0.0,
"mrr_at_5": 0.0,
"ndcg_at_1": 0.0,
"ndcg_at_10": 0.0,
"ndcg_at_100": 0.00484,
"ndcg_at_1000": 0.01054,
"ndcg_at_20": 0.0,
"ndcg_at_3": 0.0,
"ndcg_at_5": 0.0,
"precision_at_1": 0.0,
"precision_at_10": 0.0,
"precision_at_100": 0.00027,
"precision_at_1000": 8e-05,
"precision_at_20": 0.0,
"precision_at_3": 0.0,
"precision_at_5": 0.0,
"recall_at_1": 0.0,
"recall_at_10": 0.0,
"recall_at_100": 0.02703,
"recall_at_1000": 0.07658,
"recall_at_20": 0.0,
"recall_at_3": 0.0,
"recall_at_5": 0.0,
"main_score": 0.02703
}
]
}
}
@@ -0,0 +1,18 @@
{
"dataset_revision": "e06ebbbb123f8144bef1a5d18796f3dec9ae2915",
"task_name": "HALClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"fra-Latn"
],
"v_measure": 0.1377084465510841,
"main_score": 0.1377084465510841
}
]
}
}
@@ -0,0 +1,18 @@
{
"dataset_revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"task_name": "MLSUMClusteringP2P",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"v_measure": 0.4543375637260015,
"main_score": 0.4543375637260015
}
]
}
}
@@ -0,0 +1,18 @@
{
"dataset_revision": "b5d54f8f3b61ae17845046286940f03c6bc79bc7",
"task_name": "MLSUMClusteringS2S",
"evaluation_time": null,
"mteb_version": null,
"scores": {
"test": [
{
"hf_subset": "fra-Latn",
"languages": [
"None"
],
"v_measure": 0.45205646487969753,
"main_score": 0.45205646487969753
}
]
}
}
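The per-task result files added in this commit all share the structure above: a dataset_revision, a task_name, and a scores mapping from split name to a list of per-subset score dicts whose main_score carries the headline metric. A minimal sketch of extracting those headline numbers; the file path in the example is a placeholder, not a path from the repository:

# Illustrative sketch (not part of this commit): pull main_score per split out
# of one of the result JSON files shown above.
import json


def load_main_scores(path: str) -> dict[str, float]:
    """Map each evaluated split (e.g. "test") to the main_score of its first subset."""
    with open(path) as f:
        result = json.load(f)
    return {
        split: entries[0]["main_score"]
        for split, entries in result["scores"].items()
        if entries  # skip splits without any scored subsets
    }


# Example call (placeholder path):
# load_main_scores("AlloProfClusteringP2P.json")  # -> {"test": 0.6234594305243399}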