Results for Local LLM Models
This section captures the benchmark performance of locally-hosted models. Most of them were run through Ollama.ai on a consumer-grade laptop.
Please note that the models below vary in their "open-source-ness" (what has actually been released) and their licensing terms (what they can be used for). Be careful: some of them are intended for research purposes only (e.g., Microsoft Phi).
Reminder: The scores below are on a 0-100 scale, where 100 is the best possible score and 0 means the generated code was not even parseable.
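For intuition, each submission earns partial credit across several automated checks (whether the code parses, whether it executes, and how many unit tests and examples pass). The exact criteria and weights live in the JuliaLLMLeaderboard evaluation code; the even 25-point split in the sketch below is an assumption for illustration only.
# Illustrative only: an assumed even 25-point split across the four criteria
function illustrative_score(; parsed::Bool, executed::Bool,
        unit_tests_passed::Integer = 0, unit_tests_total::Integer = 0,
        examples_executed::Integer = 0, examples_total::Integer = 0)
    score = 0.0
    score += parsed ? 25 : 0
    score += executed ? 25 : 0
    score += unit_tests_total > 0 ? 25 * unit_tests_passed / unit_tests_total : 0
    score += examples_total > 0 ? 25 * examples_executed / examples_total : 0
    return score # 0 = not even parseable, 100 = a perfect solution
end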
# Imports
using JuliaLLMLeaderboard
using CairoMakie, AlgebraOfGraphics
using MarkdownTables, DataFramesMeta
using Statistics: mean, median, quantile, std;
unscrub_string(s::AbstractString) = split(s, "_") .|> titlecase |> x -> join(x, " ");
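# For example, unscrub_string("score_std_deviation") returns "Score Std Deviation"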
# ! Configuration
SAVE_PLOTS = false
DIR_RESULTS = joinpath(pkgdir(JuliaLLMLeaderboard), "code_generation")
PAID_MODELS_DEFAULT = [
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",
"gpt-3.5-turbo-0125",
"gpt-4-1106-preview",
"gpt-4-0125-preview",
"gpt-4-turbo-2024-04-09",
"gpt-4o-2024-05-13",
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
"mistral-tiny",
"mistral-small",
"mistral-medium",
"mistral-large",
"mistral-small-2402",
"mistral-medium-2312",
"mistral-large-2402",
"claude-3-opus-20240229",
"claude-3-sonnet-20240229",
"claude-3-haiku-20240307",
"claude-3-5-sonnet-20240620",
"claude-2.1",
"gemini-1.0-pro-latest",
"deepseek-chat",
"deepseek-coder",
"codestral-2405",
"mistral-large-2407"
];
MODEL_SIZES = Dict("orca2:13b" => "10-29",
"mistral:7b-instruct-v0.2-q4_0" => "4-9",
"nous-hermes2:34b-yi-q4_K_M" => "30-69",
"starling-lm:latest" => "4-9",
"dolphin-phi:2.7b-v2.6-q6_K" => "<4",
"stablelm-zephyr" => "<4",
"codellama:13b-python" => "10-29",
"magicoder:7b-s-cl-q6_K" => "4-9",
"phi:2.7b-chat-v2-q6_K" => "<4",
"magicoder" => "4-9",
"mistral:7b-instruct-q4_K_M" => "4-9",
"solar:10.7b-instruct-v1-q4_K_M" => "10-29",
"codellama:13b-instruct" => "10-29",
"openhermes2.5-mistral" => "4-9",
"llama2" => "4-9",
"yi:34b-chat" => "30-69",
"deepseek-coder:33b-instruct-q4_K_M" => "30-69",
"phind-codellama:34b-v2" => "30-69",
"openchat:7b-v3.5-1210-q4_K_M" => "4-9",
"mistral:7b-instruct-v0.2-q6_K" => "4-9",
"mistral:7b-instruct-v0.2-q4_K_M" => "4-9",
"codellama:13b-instruct-q4_K_M" => "10-29",
"codellama:7b-instruct-q4_K_M" => "4-9",
"codellama:34b-instruct-q4_K_M" => "30-69",
"codellama:70b-instruct-q2_K" => ">70",
"codellama:70b-instruct-q4_K_M" => ">70",
"qwen:72b-chat-v1.5-q4_K_M" => ">70",
"qwen:72b-chat-v1.5-q2_K" => ">70",
"qwen:14b-chat-v1.5-q6_K" => "10-29",
"qwen:14b-chat-v1.5-q4_K_M" => "10-29",
"qwen:7b-chat-v1.5-q6_K" => "4-9",
"qwen:7b-chat-v1.5-q4_K_M" => "4-9",
"qwen:4b-chat-v1.5-q6_K" => "4-9",
"gemma:7b-instruct-q6_K" => "4-9",
"accounts/fireworks/models/dbrx-instruct" => ">70",
"accounts/fireworks/models/mixtral-8x22b-instruct-preview" => ">70",
"accounts/fireworks/models/qwen-72b-chat" => ">70",
"meta-llama/Llama-3-8b-chat-hf" => "4-9",
"meta-llama/Llama-3-70b-chat-hf" => ">70",
"microsoft/WizardLM-2-8x22B" => ">70",
"mistralai/Mixtral-8x22B-Instruct-v0.1" => ">70"
)
PROMPTS = [
"JuliaExpertCoTTask",
"JuliaExpertAsk",
"InJulia",
"JuliaRecapTask",
"JuliaRecapCoTTask"
];
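# The prompt template names above refer to templates shipped with PromptingTools.jl
# (an assumption worth verifying against your installed version). To inspect one locally, e.g.:
#   using PromptingTools: aitemplates
#   aitemplates("JuliaExpertAsk")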
# Clean up model names from hosted providers (Fireworks.ai, Together.ai)
function model_clean(model::AbstractString)
model = occursin("fireworks", model) ?
replace(model, "accounts/fireworks/models/" => "") * ("(Fireworks.ai)") : model
model = occursin("meta-llama/", model) ?
replace(model, "meta-llama/" => "") * ("(Together.ai)") : model
model = occursin("mistralai/", model) ?
replace(model, "mistralai/" => "") * ("(Together.ai)") : model
model = occursin("microsoft/", model) ?
replace(model, "microsoft/" => "") * ("(Together.ai)") : model
end
;
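# For example, model_clean("meta-llama/Llama-3-70b-chat-hf") returns "Llama-3-70b-chat-hf(Together.ai)"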
Load Results
Use only the 5 most recent evaluations available for each definition/model/prompt combination
df = @chain begin
load_evals(DIR_RESULTS; max_history = 5)
@rsubset !any(startswith.(:model, PAID_MODELS_DEFAULT)) && :prompt_label in PROMPTS
# remove qwen models as they are not correct! But allow the accounts/fireworks models
@rsubset !occursin("qwen", :model) || occursin("accounts", :model)
end;
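To check how many evaluation samples back each model after the filtering above, a quick illustrative check (not part of the original workflow) is:
combine(groupby(df, :model), nrow => :n_samples)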
Model Comparison
Highest average score by model:
fig = @chain df begin
@by [:model] begin
:cost = mean(:cost)
:elapsed = mean(:elapsed_seconds)
:score = mean(:score)
end
transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
@orderby -:score
@rtransform :model_clean = model_clean(:model)
@rtransform :size_group = MODEL_SIZES[:model]
@aside local size_order = ["<4", "4-9", "10-29", "30-69", ">70"]
@aside local order_ = _.model_clean
data(_) *
mapping(:model_clean => sorter(order_) => "Model",
:score => "Avg. Score (Max 100 pts)",
color = :size_group => sorter(size_order) => "Parameter Size (Bn)") *
visual(BarPlot; bar_labels = :y, label_offset = 0, label_rotation = 1)
draw(;
figure = (; size = (900, 600)),
legend = (; position = :bottom),
axis = (;
xautolimitmargin = (0.1, 0.05),
limits = (nothing, nothing, 0, 100),
xticklabelrotation = 45,
title = "Open-Source LLM Model Performance"))
end
SAVE_PLOTS && save("assets/model-comparison-local.png", fig)
fig
Table:
output = @chain df begin
@by [:model] begin
:elapsed = mean(:elapsed_seconds)
:elapsed_median = median(:elapsed_seconds)
:score = mean(:score)
:score_median = median(:score)
:score_std_deviation = std(:score)
:count_zero_score = count(iszero, :score)
:count_full_score = count(==(100), :score)
end
transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
@orderby -:score
@rtransform :model = model_clean(:model)
rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
Model | Elapsed | Elapsed Median | Score | Score Median | Score Std Deviation | Count Zero Score | Count Full Score |
---|---|---|---|---|---|---|---|
Mixtral-8x22B-Instruct-v0.1(Together.ai) | 14.1 | 11.0 | 77.6 | 90.0 | 25.8 | 5.0 | 151.0 |
Llama-3-70b-chat-hf(Together.ai) | 4.3 | 4.1 | 76.8 | 88.3 | 25.2 | 0.0 | 160.0 |
Llama-3-8b-chat-hf(Together.ai) | 1.5 | 1.4 | 67.7 | 66.7 | 26.4 | 5.0 | 70.0 |
WizardLM-2-8x22B(Together.ai) | 34.7 | 31.0 | 62.7 | 60.0 | 33.8 | 33.0 | 118.0 |
phind-codellama:34b-v2 | 37.1 | 36.4 | 61.8 | 62.5 | 33.5 | 36.0 | 58.0 |
magicoder:7b-s-cl-q6_K | 15.6 | 15.8 | 59.9 | 60.0 | 29.9 | 18.0 | 35.0 |
codellama:13b-instruct-q4KM | 3.2 | 3.0 | 56.4 | 54.6 | 33.0 | 56.0 | 61.0 |
magicoder | 12.8 | 10.7 | 53.7 | 50.0 | 33.2 | 49.0 | 52.0 |
nous-hermes2:34b-yi-q4KM | 56.8 | 52.8 | 50.7 | 50.0 | 34.7 | 78.0 | 56.0 |
dbrx-instruct(Fireworks.ai) | 3.7 | 3.6 | 50.0 | 50.0 | 41.2 | 121.0 | 75.0 |
codellama:13b-instruct | 18.1 | 16.7 | 50.0 | 50.0 | 34.4 | 65.0 | 44.0 |
openchat:7b-v3.5-1210-q4KM | 14.4 | 13.7 | 49.4 | 50.0 | 30.3 | 48.0 | 23.0 |
openhermes2.5-mistral | 12.9 | 12.2 | 48.9 | 50.0 | 31.3 | 55.0 | 27.0 |
starling-lm:latest | 13.7 | 12.5 | 48.4 | 50.0 | 30.2 | 58.0 | 26.0 |
codellama:7b-instruct-q4KM | 2.1 | 2.0 | 47.8 | 50.0 | 35.3 | 95.0 | 38.0 |
qwen-72b-chat(Fireworks.ai) | 3.2 | 3.8 | 45.9 | 50.0 | 38.8 | 117.0 | 63.0 |
yi:34b-chat | 43.9 | 41.3 | 45.6 | 50.0 | 30.5 | 45.0 | 34.0 |
mistral:7b-instruct-v0.2-q6_K | 21.7 | 20.9 | 45.4 | 50.0 | 31.3 | 44.0 | 23.0 |
mistral:7b-instruct-v0.2-q4_0 | 12.4 | 12.3 | 44.3 | 50.0 | 30.6 | 75.0 | 32.0 |
mistral:7b-instruct-v0.2-q4KM | 15.6 | 15.1 | 42.6 | 50.0 | 28.6 | 71.0 | 23.0 |
codellama:34b-instruct-q4KM | 7.5 | 6.8 | 39.7 | 50.0 | 36.1 | 127.0 | 35.0 |
codellama:70b-instruct-q4KM | 16.3 | 13.8 | 36.4 | 0.0 | 41.2 | 179.0 | 58.0 |
solar:10.7b-instruct-v1-q4KM | 18.8 | 17.7 | 35.2 | 50.0 | 31.1 | 107.0 | 10.0 |
mistral:7b-instruct-q4KM | 13.9 | 13.0 | 34.8 | 50.0 | 26.5 | 80.0 | 0.0 |
codellama:70b-instruct-q2_K | 11.2 | 9.4 | 29.8 | 0.0 | 37.7 | 198.0 | 29.0 |
llama2 | 17.1 | 16.3 | 26.5 | 25.0 | 26.5 | 131.0 | 0.0 |
gemma:7b-instruct-q6_K | 20.9 | 22.1 | 25.9 | 25.0 | 25.2 | 147.0 | 2.0 |
orca2:13b | 20.1 | 18.3 | 23.1 | 0.0 | 30.6 | 166.0 | 11.0 |
stablelm-zephyr | 9.9 | 7.7 | 15.4 | 0.0 | 23.5 | 192.0 | 1.0 |
dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.4 | 14.9 | 0.0 | 22.9 | 188.0 | 0.0 |
codellama:13b-python | 12.5 | 10.7 | 12.8 | 0.0 | 22.1 | 155.0 | 0.0 |
phi:2.7b-chat-v2-q6_K | 13.0 | 11.6 | 8.9 | 0.0 | 19.4 | 222.0 | 0.0 |
Note that our sample size is low, so the rankings could easily change (the standard deviations of the estimated means are high). Take the results only as indicative.
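To make that uncertainty concrete, you can attach a standard error to each model's mean score; with standard deviations around 25-40 points, ±2 standard errors typically spans a few points, which is enough to reorder models with similar averages. An illustrative calculation (not part of the original workflow):
err_tbl = @chain df begin
    @by [:model] begin
        :score_avg = mean(:score)
        :score_sem = std(:score) / sqrt(length(:score)) # standard error of the mean
        :n_samples = length(:score)
    end
    @orderby -:score_avg
end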
Overview by Prompt Template
Bar chart with all local models and various prompt templates
fig = @chain df begin
@by [:model, :prompt_label] begin
:cost = mean(:cost)
:elapsed = mean(:elapsed_seconds)
:score = mean(:score)
:score_median = median(:score)
:cnt = $nrow
end
@rtransform :model = model_clean(:model)
@aside local average_ = @by(_, :model, :avg=mean(:score)) |>
x -> @orderby(x, -:avg).model
data(_) *
mapping(:model => sorter(average_) => "Model",
:score => "Avg. Score (Max 100 pts)",
color = :prompt_label => "Prompts",
dodge = :prompt_label) * visual(BarPlot)
draw(; figure = (size = (900, 600),),
axis = (xautolimitmargin = (0.1, 0.05), xticklabelrotation = 45,
title = "Comparison for Local Models"),
legend = (; position = :bottom))
end
SAVE_PLOTS && save("assets/model-prompt-comparison-local.png", fig)
fig
Table:
output = @chain df begin
@by [:model, :prompt_label] begin
:cost = mean(:cost)
:elapsed = mean(:elapsed_seconds)
:score = mean(:score)
end
@aside average_ = @by _ :model :AverageScore=mean(:score) |> x -> round(x, digits = 1)
unstack(:model, :prompt_label, :score; fill = 0.0)
transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
leftjoin(average_, on = :model)
@orderby -:AverageScore
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
model | InJulia | JuliaExpertAsk | JuliaExpertCoTTask | JuliaRecapCoTTask | JuliaRecapTask | AverageScore |
---|---|---|---|---|---|---|
mistralai/Mixtral-8x22B-Instruct-v0.1 | 77.8 | 79.9 | 73.1 | 82.5 | 74.8 | 77.6 |
meta-llama/Llama-3-70b-chat-hf | 75.5 | 82.9 | 75.9 | 76.7 | 73.2 | 76.8 |
meta-llama/Llama-3-8b-chat-hf | 65.5 | 71.8 | 68.0 | 65.7 | 67.6 | 67.7 |
microsoft/WizardLM-2-8x22B | 52.8 | 67.1 | 70.8 | 63.5 | 59.1 | 62.7 |
phind-codellama:34b-v2 | 59.0 | 68.1 | 60.1 | 59.8 | 62.0 | 61.8 |
magicoder:7b-s-cl-q6_K | 62.2 | 60.1 | 52.8 | 59.4 | 65.2 | 59.9 |
codellama:13b-instruct-q4KM | 67.6 | 63.4 | 42.8 | 52.5 | 55.8 | 56.4 |
magicoder | 60.8 | 50.2 | 44.7 | 56.6 | 56.3 | 53.7 |
nous-hermes2:34b-yi-q4KM | 61.8 | 37.4 | 51.4 | 46.1 | 56.6 | 50.7 |
accounts/fireworks/models/dbrx-instruct | 56.3 | 64.3 | 47.3 | 40.8 | 41.6 | 50.0 |
codellama:13b-instruct | 53.2 | 51.1 | 44.5 | 48.5 | 53.0 | 50.0 |
openchat:7b-v3.5-1210-q4KM | 50.9 | 51.0 | 43.1 | 49.1 | 52.9 | 49.4 |
openhermes2.5-mistral | 49.6 | 50.7 | 51.9 | 40.9 | 51.3 | 48.9 |
starling-lm:latest | 51.1 | 55.5 | 36.8 | 46.1 | 52.6 | 48.4 |
codellama:7b-instruct-q4KM | 57.7 | 33.1 | 32.6 | 55.4 | 60.4 | 47.8 |
accounts/fireworks/models/qwen-72b-chat | 59.7 | 51.4 | 49.2 | 27.2 | 42.2 | 45.9 |
yi:34b-chat | 44.5 | 52.7 | 39.2 | 44.0 | 47.6 | 45.6 |
mistral:7b-instruct-v0.2-q6_K | 43.2 | 39.8 | 47.5 | 48.2 | 48.6 | 45.4 |
mistral:7b-instruct-v0.2-q4_0 | 47.4 | 40.3 | 42.8 | 43.8 | 47.3 | 44.3 |
mistral:7b-instruct-v0.2-q4KM | 41.8 | 48.4 | 40.5 | 38.5 | 44.0 | 42.6 |
codellama:34b-instruct-q4KM | 50.1 | 53.0 | 34.8 | 29.0 | 31.6 | 39.7 |
codellama:70b-instruct-q4KM | 43.9 | 29.9 | 30.8 | 35.2 | 42.4 | 36.4 |
solar:10.7b-instruct-v1-q4KM | 43.9 | 36.9 | 19.7 | 36.7 | 38.9 | 35.2 |
mistral:7b-instruct-q4KM | 37.3 | 37.3 | 35.0 | 31.5 | 33.0 | 34.8 |
codellama:70b-instruct-q2_K | 34.6 | 21.3 | 25.5 | 30.0 | 37.4 | 29.8 |
llama2 | 26.4 | 31.4 | 27.2 | 25.0 | 22.4 | 26.5 |
gemma:7b-instruct-q6_K | 22.6 | 25.3 | 19.9 | 34.9 | 26.9 | 25.9 |
orca2:13b | 31.3 | 16.5 | 24.2 | 21.5 | 21.9 | 23.1 |
stablelm-zephyr | 13.3 | 15.6 | 12.2 | 16.5 | 19.2 | 15.4 |
dolphin-phi:2.7b-v2.6-q6_K | 15.6 | 16.1 | 12.2 | 14.1 | 16.3 | 14.9 |
codellama:13b-python | 11.0 | 13.3 | 13.3 | 12.4 | 13.9 | 12.8 |
phi:2.7b-chat-v2-q6_K | 6.0 | 8.1 | 9.9 | 9.9 | 10.8 | 8.9 |
Other Considerations
Comparison of time-to-generate vs. average score. We removed any HOSTED model (that's why you don't see some models that appear in the other plots).
fig = @chain df begin
@rsubset !occursin("HOSTED", :device)
@aside local xlims = quantile(df.elapsed_seconds, [0.01, 0.99])
@by [:model, :prompt_label] begin
:elapsed = mean(:elapsed_seconds)
:elapsed_median = median(:elapsed_seconds)
:score = mean(:score)
:score_median = median(:score)
:cnt = $nrow
end
data(_) * mapping(:elapsed => "Avg. Elapsed Time (s)",
:score => "Avg. Score (Max 100 pts)",
color = :model => "Model")
draw(; figure = (size = (800, 900),),
axis = (xautolimitmargin = (0.1, 0.05), xticklabelrotation = 45,
title = "Elapsed Time vs Score for Local Models",
limits = (xlims..., nothing, nothing)),
palettes = (; color = Makie.ColorSchemes.tab20.colors))
end
SAVE_PLOTS && save("assets/elapsed-vs-score-scatter-local.png", fig)
fig
Table:
- Point per second is the average score divided by the average elapsed time
output = @chain df begin
@rsubset !occursin("HOSTED", :device)
@by [:model, :prompt_label] begin
:elapsed = mean(:elapsed_seconds)
:elapsed_median = median(:elapsed_seconds)
:score_avg = mean(:score)
:score_median = median(:score)
:cnt = $nrow
end
@rtransform :point_per_second = :score_avg / :elapsed
@orderby -:point_per_second
#
transform(_,
names(_, Not(:model, :prompt_label)) .=> ByRow(x -> round(x, digits = 1)),
renamecols = false)
rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
Model | Prompt Label | Elapsed | Elapsed Median | Score Avg | Score Median | Cnt | Point Per Second |
---|---|---|---|---|---|---|---|
codellama:13b-instruct-q4KM | JuliaExpertAsk | 2.0 | 1.9 | 63.4 | 75.0 | 70.0 | 32.1 |
codellama:7b-instruct-q4KM | InJulia | 2.0 | 2.0 | 57.7 | 55.0 | 70.0 | 29.1 |
codellama:7b-instruct-q4KM | JuliaExpertAsk | 1.2 | 0.9 | 33.1 | 0.0 | 70.0 | 26.5 |
codellama:7b-instruct-q4KM | JuliaRecapTask | 2.6 | 2.5 | 60.4 | 60.0 | 70.0 | 23.3 |
codellama:7b-instruct-q4KM | JuliaExpertCoTTask | 1.6 | 1.4 | 32.6 | 0.0 | 70.0 | 20.6 |
codellama:13b-instruct-q4KM | InJulia | 3.6 | 3.4 | 67.6 | 61.2 | 70.0 | 18.7 |
codellama:7b-instruct-q4KM | JuliaRecapCoTTask | 3.0 | 2.7 | 55.4 | 50.0 | 70.0 | 18.3 |
codellama:13b-instruct-q4KM | JuliaExpertCoTTask | 2.7 | 2.3 | 42.8 | 50.0 | 70.0 | 15.9 |
codellama:13b-instruct-q4KM | JuliaRecapTask | 3.9 | 3.6 | 55.8 | 50.0 | 70.0 | 14.2 |
codellama:13b-instruct-q4KM | JuliaRecapCoTTask | 3.9 | 4.1 | 52.5 | 50.0 | 70.0 | 13.5 |
codellama:34b-instruct-q4KM | JuliaExpertAsk | 6.3 | 5.8 | 53.0 | 50.0 | 70.0 | 8.4 |
mistral:7b-instruct-v0.2-q4_0 | JuliaExpertAsk | 5.8 | 5.6 | 40.3 | 50.0 | 70.0 | 6.9 |
openchat:7b-v3.5-1210-q4KM | JuliaExpertAsk | 7.8 | 6.9 | 51.0 | 50.0 | 55.0 | 6.5 |
codellama:34b-instruct-q4KM | InJulia | 8.1 | 7.9 | 50.1 | 50.0 | 70.0 | 6.2 |
mistral:7b-instruct-v0.2-q4KM | JuliaExpertAsk | 8.1 | 6.9 | 48.4 | 50.0 | 70.0 | 6.0 |
openhermes2.5-mistral | JuliaExpertAsk | 8.7 | 8.9 | 50.7 | 52.5 | 58.0 | 5.8 |
starling-lm:latest | JuliaExpertAsk | 9.9 | 9.8 | 55.5 | 50.0 | 58.0 | 5.6 |
magicoder | InJulia | 11.0 | 9.6 | 60.8 | 60.0 | 57.0 | 5.6 |
codellama:34b-instruct-q4KM | JuliaExpertCoTTask | 6.4 | 6.4 | 34.8 | 25.0 | 70.0 | 5.4 |
magicoder | JuliaExpertAsk | 9.8 | 8.7 | 50.2 | 50.0 | 58.0 | 5.1 |
codellama:13b-instruct | JuliaExpertAsk | 10.4 | 8.5 | 51.1 | 50.0 | 58.0 | 4.9 |
mistral:7b-instruct-q4KM | JuliaExpertAsk | 7.7 | 7.4 | 37.3 | 50.0 | 57.0 | 4.9 |
openhermes2.5-mistral | InJulia | 10.8 | 9.8 | 49.6 | 50.0 | 58.0 | 4.6 |
starling-lm:latest | InJulia | 11.1 | 11.1 | 51.1 | 50.0 | 58.0 | 4.6 |
openchat:7b-v3.5-1210-q4KM | InJulia | 11.9 | 11.7 | 50.9 | 50.0 | 55.0 | 4.3 |
magicoder:7b-s-cl-q6_K | InJulia | 14.6 | 15.3 | 62.2 | 55.0 | 42.0 | 4.3 |
magicoder | JuliaRecapCoTTask | 13.4 | 12.0 | 56.6 | 50.0 | 57.0 | 4.2 |
magicoder | JuliaRecapTask | 13.4 | 10.9 | 56.3 | 50.0 | 57.0 | 4.2 |
codellama:34b-instruct-q4KM | JuliaRecapTask | 7.6 | 6.2 | 31.6 | 0.0 | 70.0 | 4.2 |
magicoder:7b-s-cl-q6_K | JuliaExpertAsk | 14.6 | 14.7 | 60.1 | 58.1 | 42.0 | 4.1 |
magicoder:7b-s-cl-q6_K | JuliaRecapTask | 16.1 | 16.5 | 65.2 | 60.0 | 42.0 | 4.1 |
mistral:7b-instruct-v0.2-q6_K | JuliaExpertAsk | 10.0 | 8.2 | 39.8 | 50.0 | 42.0 | 4.0 |
mistral:7b-instruct-v0.2-q4_0 | InJulia | 12.1 | 11.3 | 47.4 | 50.0 | 70.0 | 3.9 |
mistral:7b-instruct-q4KM | InJulia | 10.3 | 10.0 | 37.3 | 50.0 | 57.0 | 3.6 |
magicoder:7b-s-cl-q6_K | JuliaRecapCoTTask | 16.9 | 16.4 | 59.4 | 55.6 | 42.0 | 3.5 |
openhermes2.5-mistral | JuliaRecapTask | 15.0 | 14.7 | 51.3 | 50.0 | 58.0 | 3.4 |
magicoder:7b-s-cl-q6_K | JuliaExpertCoTTask | 16.0 | 16.5 | 52.8 | 58.1 | 42.0 | 3.3 |
starling-lm:latest | JuliaRecapTask | 16.0 | 13.6 | 52.6 | 50.0 | 58.0 | 3.3 |
codellama:34b-instruct-q4KM | JuliaRecapCoTTask | 8.8 | 7.4 | 29.0 | 12.5 | 70.0 | 3.3 |
codellama:13b-instruct | InJulia | 16.4 | 14.7 | 53.2 | 50.0 | 58.0 | 3.2 |
mistral:7b-instruct-v0.2-q4_0 | JuliaExpertCoTTask | 13.2 | 12.9 | 42.8 | 50.0 | 70.0 | 3.2 |
codellama:70b-instruct-q2_K | JuliaRecapTask | 11.7 | 9.5 | 37.4 | 25.0 | 70.0 | 3.2 |
llama2 | JuliaExpertAsk | 9.8 | 9.1 | 31.4 | 50.0 | 59.0 | 3.2 |
openhermes2.5-mistral | JuliaExpertCoTTask | 16.6 | 16.0 | 51.9 | 50.0 | 57.0 | 3.1 |
starling-lm:latest | JuliaRecapCoTTask | 14.8 | 13.2 | 46.1 | 50.0 | 58.0 | 3.1 |
openhermes2.5-mistral | JuliaRecapCoTTask | 13.3 | 13.5 | 40.9 | 50.0 | 58.0 | 3.1 |
openchat:7b-v3.5-1210-q4KM | JuliaRecapTask | 17.2 | 15.8 | 52.9 | 50.0 | 55.0 | 3.1 |
mistral:7b-instruct-v0.2-q4KM | InJulia | 14.1 | 13.9 | 41.8 | 50.0 | 70.0 | 3.0 |
mistral:7b-instruct-v0.2-q4_0 | JuliaRecapCoTTask | 14.8 | 14.2 | 43.8 | 50.0 | 70.0 | 3.0 |
mistral:7b-instruct-v0.2-q4_0 | JuliaRecapTask | 16.2 | 15.4 | 47.3 | 50.0 | 70.0 | 2.9 |
openchat:7b-v3.5-1210-q4KM | JuliaRecapCoTTask | 16.9 | 15.7 | 49.1 | 50.0 | 55.0 | 2.9 |
solar:10.7b-instruct-v1-q4KM | JuliaExpertAsk | 13.0 | 12.5 | 36.9 | 50.0 | 57.0 | 2.8 |
codellama:70b-instruct-q2_K | JuliaExpertCoTTask | 9.1 | 8.4 | 25.5 | 0.0 | 70.0 | 2.8 |
magicoder | JuliaExpertCoTTask | 16.2 | 15.1 | 44.7 | 37.5 | 58.0 | 2.8 |
orca2:13b | InJulia | 11.4 | 10.5 | 31.3 | 25.0 | 57.0 | 2.7 |
codellama:70b-instruct-q4KM | InJulia | 16.5 | 14.7 | 43.9 | 50.0 | 70.0 | 2.7 |
codellama:70b-instruct-q2_K | InJulia | 13.0 | 10.9 | 34.6 | 25.0 | 70.0 | 2.7 |
solar:10.7b-instruct-v1-q4KM | InJulia | 17.4 | 15.7 | 43.9 | 50.0 | 57.0 | 2.5 |
stablelm-zephyr | JuliaExpertAsk | 6.3 | 6.6 | 15.6 | 0.0 | 57.0 | 2.5 |
mistral:7b-instruct-v0.2-q4KM | JuliaExpertCoTTask | 16.4 | 15.9 | 40.5 | 50.0 | 70.0 | 2.5 |
codellama:70b-instruct-q2_K | JuliaRecapCoTTask | 12.3 | 9.7 | 30.0 | 0.0 | 70.0 | 2.4 |
codellama:13b-instruct | JuliaRecapTask | 21.9 | 20.6 | 53.0 | 50.0 | 58.0 | 2.4 |
gemma:7b-instruct-q6_K | JuliaExpertAsk | 10.5 | 6.2 | 25.3 | 25.0 | 70.0 | 2.4 |
dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertAsk | 6.8 | 6.3 | 16.1 | 0.0 | 56.0 | 2.4 |
openchat:7b-v3.5-1210-q4KM | JuliaExpertCoTTask | 18.2 | 17.9 | 43.1 | 50.0 | 55.0 | 2.4 |
codellama:70b-instruct-q4KM | JuliaRecapTask | 17.9 | 14.1 | 42.4 | 37.5 | 70.0 | 2.4 |
phind-codellama:34b-v2 | JuliaExpertAsk | 29.5 | 27.7 | 68.1 | 66.7 | 57.0 | 2.3 |
codellama:13b-instruct | JuliaRecapCoTTask | 21.7 | 20.8 | 48.5 | 50.0 | 58.0 | 2.2 |
codellama:13b-instruct | JuliaExpertCoTTask | 20.0 | 19.3 | 44.5 | 50.0 | 58.0 | 2.2 |
mistral:7b-instruct-v0.2-q6_K | InJulia | 19.4 | 17.1 | 43.2 | 50.0 | 42.0 | 2.2 |
starling-lm:latest | JuliaExpertCoTTask | 16.6 | 16.2 | 36.8 | 50.0 | 58.0 | 2.2 |
codellama:70b-instruct-q2_K | JuliaExpertAsk | 9.8 | 8.8 | 21.3 | 0.0 | 70.0 | 2.2 |
mistral:7b-instruct-q4KM | JuliaExpertCoTTask | 16.3 | 15.8 | 35.0 | 25.0 | 57.0 | 2.1 |
mistral:7b-instruct-v0.2-q4KM | JuliaRecapTask | 20.5 | 18.7 | 44.0 | 50.0 | 70.0 | 2.1 |
codellama:70b-instruct-q4KM | JuliaRecapCoTTask | 16.7 | 12.8 | 35.2 | 0.0 | 70.0 | 2.1 |
codellama:70b-instruct-q4KM | JuliaExpertCoTTask | 14.8 | 13.3 | 30.8 | 0.0 | 70.0 | 2.1 |
mistral:7b-instruct-v0.2-q4KM | JuliaRecapCoTTask | 18.9 | 17.9 | 38.5 | 50.0 | 70.0 | 2.0 |
yi:34b-chat | JuliaExpertAsk | 26.1 | 22.8 | 52.7 | 52.5 | 58.0 | 2.0 |
mistral:7b-instruct-v0.2-q6_K | JuliaExpertCoTTask | 23.8 | 25.1 | 47.5 | 50.0 | 42.0 | 2.0 |
mistral:7b-instruct-q4KM | JuliaRecapTask | 16.7 | 15.9 | 33.0 | 25.0 | 55.0 | 2.0 |
codellama:70b-instruct-q4KM | JuliaExpertAsk | 15.7 | 13.3 | 29.9 | 0.0 | 70.0 | 1.9 |
solar:10.7b-instruct-v1-q4KM | JuliaRecapCoTTask | 19.7 | 19.1 | 36.7 | 50.0 | 57.0 | 1.9 |
solar:10.7b-instruct-v1-q4KM | JuliaRecapTask | 21.3 | 21.0 | 38.9 | 50.0 | 57.0 | 1.8 |
mistral:7b-instruct-v0.2-q6_K | JuliaRecapCoTTask | 26.9 | 24.7 | 48.2 | 50.0 | 42.0 | 1.8 |
phind-codellama:34b-v2 | InJulia | 33.2 | 34.3 | 59.0 | 61.2 | 57.0 | 1.8 |
dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapTask | 9.5 | 9.3 | 16.3 | 0.0 | 56.0 | 1.7 |
llama2 | InJulia | 15.3 | 13.9 | 26.4 | 25.0 | 59.0 | 1.7 |
mistral:7b-instruct-v0.2-q6_K | JuliaRecapTask | 28.3 | 27.2 | 48.6 | 50.0 | 42.0 | 1.7 |
mistral:7b-instruct-q4KM | JuliaRecapCoTTask | 18.7 | 17.6 | 31.5 | 50.0 | 55.0 | 1.7 |
phind-codellama:34b-v2 | JuliaRecapCoTTask | 37.1 | 36.9 | 59.8 | 61.2 | 57.0 | 1.6 |
stablelm-zephyr | JuliaRecapTask | 12.1 | 8.3 | 19.2 | 0.0 | 57.0 | 1.6 |
stablelm-zephyr | InJulia | 8.5 | 6.6 | 13.3 | 0.0 | 57.0 | 1.6 |
phind-codellama:34b-v2 | JuliaRecapTask | 41.1 | 40.6 | 62.0 | 61.2 | 57.0 | 1.5 |
dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertCoTTask | 8.1 | 8.0 | 12.2 | 0.0 | 56.0 | 1.5 |
dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapCoTTask | 9.4 | 8.9 | 14.1 | 0.0 | 56.0 | 1.5 |
orca2:13b | JuliaExpertAsk | 11.0 | 9.2 | 16.5 | 0.0 | 57.0 | 1.5 |
dolphin-phi:2.7b-v2.6-q6_K | InJulia | 10.6 | 9.4 | 15.6 | 0.0 | 56.0 | 1.5 |
stablelm-zephyr | JuliaRecapCoTTask | 11.4 | 8.8 | 16.5 | 0.0 | 57.0 | 1.4 |
llama2 | JuliaExpertCoTTask | 18.9 | 17.3 | 27.2 | 25.0 | 59.0 | 1.4 |
gemma:7b-instruct-q6_K | JuliaRecapCoTTask | 25.7 | 25.0 | 34.9 | 50.0 | 70.0 | 1.4 |
phind-codellama:34b-v2 | JuliaExpertCoTTask | 44.6 | 46.5 | 60.1 | 66.7 | 57.0 | 1.3 |
codellama:13b-python | JuliaRecapCoTTask | 9.5 | 6.4 | 12.4 | 0.0 | 42.0 | 1.3 |
llama2 | JuliaRecapCoTTask | 19.3 | 19.2 | 25.0 | 25.0 | 59.0 | 1.3 |
codellama:13b-python | JuliaExpertAsk | 10.4 | 7.9 | 13.3 | 0.0 | 44.0 | 1.3 |
nous-hermes2:34b-yi-q4KM | InJulia | 52.0 | 45.5 | 61.8 | 60.0 | 67.0 | 1.2 |
gemma:7b-instruct-q6_K | InJulia | 19.7 | 20.3 | 22.6 | 25.0 | 70.0 | 1.1 |
gemma:7b-instruct-q6_K | JuliaRecapTask | 24.8 | 23.6 | 26.9 | 25.0 | 70.0 | 1.1 |
phi:2.7b-chat-v2-q6_K | JuliaExpertCoTTask | 9.3 | 5.7 | 9.9 | 0.0 | 55.0 | 1.1 |
stablelm-zephyr | JuliaExpertCoTTask | 11.4 | 9.6 | 12.2 | 0.0 | 57.0 | 1.1 |
nous-hermes2:34b-yi-q4KM | JuliaExpertAsk | 35.9 | 32.9 | 37.4 | 50.0 | 67.0 | 1.0 |
codellama:13b-python | JuliaExpertCoTTask | 12.8 | 12.9 | 13.3 | 0.0 | 43.0 | 1.0 |
llama2 | JuliaRecapTask | 22.1 | 22.2 | 22.4 | 0.0 | 59.0 | 1.0 |
orca2:13b | JuliaExpertCoTTask | 23.9 | 23.1 | 24.2 | 0.0 | 57.0 | 1.0 |
yi:34b-chat | JuliaRecapTask | 50.8 | 48.8 | 47.6 | 50.0 | 58.0 | 0.9 |
yi:34b-chat | JuliaExpertCoTTask | 42.1 | 40.6 | 39.2 | 25.0 | 58.0 | 0.9 |
yi:34b-chat | JuliaRecapCoTTask | 49.5 | 45.6 | 44.0 | 50.0 | 57.0 | 0.9 |
solar:10.7b-instruct-v1-q4KM | JuliaExpertCoTTask | 22.5 | 22.4 | 19.7 | 0.0 | 58.0 | 0.9 |
yi:34b-chat | InJulia | 51.1 | 48.6 | 44.5 | 50.0 | 58.0 | 0.9 |
codellama:13b-python | JuliaRecapTask | 16.3 | 10.9 | 13.9 | 0.0 | 43.0 | 0.9 |
nous-hermes2:34b-yi-q4KM | JuliaRecapTask | 67.6 | 61.9 | 56.6 | 50.0 | 65.0 | 0.8 |
gemma:7b-instruct-q6_K | JuliaExpertCoTTask | 23.8 | 23.3 | 19.9 | 25.0 | 70.0 | 0.8 |
phi:2.7b-chat-v2-q6_K | JuliaRecapCoTTask | 12.1 | 10.8 | 9.9 | 0.0 | 55.0 | 0.8 |
codellama:13b-python | InJulia | 13.7 | 12.5 | 11.0 | 0.0 | 44.0 | 0.8 |
orca2:13b | JuliaRecapCoTTask | 26.7 | 25.4 | 21.5 | 0.0 | 57.0 | 0.8 |
phi:2.7b-chat-v2-q6_K | JuliaRecapTask | 13.6 | 12.9 | 10.8 | 0.0 | 55.0 | 0.8 |
orca2:13b | JuliaRecapTask | 27.5 | 24.8 | 21.9 | 0.0 | 57.0 | 0.8 |
nous-hermes2:34b-yi-q4KM | JuliaRecapCoTTask | 58.5 | 60.0 | 46.1 | 50.0 | 65.0 | 0.8 |
nous-hermes2:34b-yi-q4KM | JuliaExpertCoTTask | 70.4 | 65.5 | 51.4 | 55.0 | 67.0 | 0.7 |
phi:2.7b-chat-v2-q6_K | JuliaExpertAsk | 14.6 | 13.4 | 8.1 | 0.0 | 55.0 | 0.6 |
phi:2.7b-chat-v2-q6_K | InJulia | 15.7 | 17.8 | 6.0 | 0.0 | 55.0 | 0.4 |
This page was generated using Literate.jl.