Results for Local LLM Models

The below captures the benchmark performance of the local models. Most of these were run through Ollama.ai on a consumer-grade laptop.

Please note that the below models vary in their "open-source-ness" (what has been actually released) and their licensing terms (what they can be used for). Be careful - some of the below models are for research purposes only (e.g., Microsoft Phi).

Reminder: The below scores are on a scale 0-100, where 100 is the best possible score and 0 means the generated code was not even parseable.

# Imports
using JuliaLLMLeaderboard
using CairoMakie, AlgebraOfGraphics
using MarkdownTables, DataFramesMeta
using Statistics: mean, median, quantile, std;
# Turn a snake_case identifier into a space-separated, title-cased label
# (used below to prettify the column names of the result tables).
unscrub_string(s::AbstractString) = join(titlecase.(split(s, "_")), " ");

# ! Configuration
# When true, each figure below is also written to disk by its `SAVE_PLOTS && save(...)` line.
SAVE_PLOTS = false
# Directory passed to `load_evals`: the `code_generation` folder of the package.
DIR_RESULTS = joinpath(pkgdir(JuliaLLMLeaderboard), "code_generation")
# Names of the paid/hosted API models. When the results are loaded, any row whose
# model name starts with one of these entries is treated as a paid model.
PAID_MODELS_DEFAULT = [
    # gpt-* family
    "gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125",
    "gpt-4-1106-preview", "gpt-4-0125-preview",
    "gpt-4-turbo-2024-04-09", "gpt-4o-2024-05-13",
    # mistral-* family
    "mistral-tiny", "mistral-small", "mistral-medium", "mistral-large",
    "mistral-small-2402", "mistral-medium-2312", "mistral-large-2402",
    # claude-* family
    "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307",
    "claude-2.1",
    # gemini-* family
    "gemini-1.0-pro-latest",
    # deepseek-* family
    "deepseek-chat", "deepseek-coder",
];
# Parameter-size bucket (in billions of parameters) for every model shown below.
# Written bucket-first for readability and flattened into a model => bucket lookup;
# the bar chart uses it to colour the models by "Parameter Size (Bn)".
MODEL_SIZES = Dict{String, String}(model => bucket
for (bucket, models) in (
    "<4" => [
        "dolphin-phi:2.7b-v2.6-q6_K",
        "stablelm-zephyr",
        "phi:2.7b-chat-v2-q6_K",
    ],
    "4-9" => [
        "mistral:7b-instruct-v0.2-q4_0",
        "starling-lm:latest",
        "magicoder:7b-s-cl-q6_K",
        "magicoder",
        "mistral:7b-instruct-q4_K_M",
        "openhermes2.5-mistral",
        "llama2",
        "openchat:7b-v3.5-1210-q4_K_M",
        "mistral:7b-instruct-v0.2-q6_K",
        "mistral:7b-instruct-v0.2-q4_K_M",
        "codellama:7b-instruct-q4_K_M",
        "qwen:7b-chat-v1.5-q6_K",
        "qwen:7b-chat-v1.5-q4_K_M",
        "qwen:4b-chat-v1.5-q6_K",
        "gemma:7b-instruct-q6_K",
        "meta-llama/Llama-3-8b-chat-hf",
    ],
    "10-29" => [
        "orca2:13b",
        "codellama:13b-python",
        "solar:10.7b-instruct-v1-q4_K_M",
        "codellama:13b-instruct",
        "codellama:13b-instruct-q4_K_M",
        "qwen:14b-chat-v1.5-q6_K",
        "qwen:14b-chat-v1.5-q4_K_M",
    ],
    "30-69" => [
        "nous-hermes2:34b-yi-q4_K_M",
        "yi:34b-chat",
        "deepseek-coder:33b-instruct-q4_K_M",
        "phind-codellama:34b-v2",
        "codellama:34b-instruct-q4_K_M",
    ],
    ">70" => [
        "codellama:70b-instruct-q2_K",
        "codellama:70b-instruct-q4_K_M",
        "qwen:72b-chat-v1.5-q4_K_M",
        "qwen:72b-chat-v1.5-q2_K",
        "accounts/fireworks/models/dbrx-instruct",
        "accounts/fireworks/models/mixtral-8x22b-instruct-preview",
        "accounts/fireworks/models/qwen-72b-chat",
        "meta-llama/Llama-3-70b-chat-hf",
        "microsoft/WizardLM-2-8x22B",
        "mistralai/Mixtral-8x22B-Instruct-v0.1",
    ],
)
for model in models)
# Prompt templates included in the comparison; rows with any other prompt label
# are filtered out when the results are loaded.
PROMPTS = ["JuliaExpertCoTTask", "JuliaExpertAsk", "InJulia",
    "JuliaRecapTask", "JuliaRecapCoTTask"];
"""
    model_clean(model::AbstractString)

Return a display name for `model`: strip the hosting-provider path prefix and append
the provider tag. Names that match none of the rules are returned unchanged.
"""
function model_clean(model::AbstractString)
    # (marker that triggers the rule, text removed from the name, tag appended)
    rules = (
        ("fireworks", "accounts/fireworks/models/", "(Fireworks.ai)"),
        ("meta-llama/", "meta-llama/", "(Together.ai)"),
        ("mistralai/", "mistralai/", "(Together.ai)"),
        ("microsoft/", "microsoft/", "(Together.ai)"),
    )
    label = model
    # Rules are applied in order, each one on the result of the previous one
    for (marker, stripped, tag) in rules
        if occursin(marker, label)
            label = replace(label, stripped => "") * tag
        end
    end
    return label
end
;

Load Results

Use only the 5 most recent evaluations available for each definition/model/prompt

# Load the evaluations (at most `max_history = 5` per definition/model/prompt) and keep
# only the local / open-source models for the prompt templates listed in PROMPTS.
df = @chain begin
    load_evals(DIR_RESULTS; max_history = 5)
    @rsubset :prompt_label in PROMPTS
    # Paid models are matched by name prefix. A model listed in MODEL_SIZES is always
    # kept: otherwise the paid "deepseek-coder" entry would also drop the local
    # "deepseek-coder:33b-instruct-q4_K_M", whose name starts with the same text.
    @rsubset haskey(MODEL_SIZES, :model) || !any(startswith.(:model, PAID_MODELS_DEFAULT))
    # remove qwen models as they are not correct! But allow the accounts/fireworks models
    @rsubset !occursin("qwen", :model) || occursin("accounts", :model)
end;

Model Comparison

Highest average score by model:

# Bar chart: average score per model, sorted by score and coloured by size bucket.
fig = @chain df begin
    @by [:model] begin
        :cost = mean(:cost)
        :elapsed = mean(:elapsed_seconds)
        :score = mean(:score)
    end
    # Round all numeric columns to one decimal (the scores are printed as bar labels)
    transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
    @orderby -:score
    @rtransform :model_clean = model_clean(:model)
    # MODEL_SIZES is keyed by the raw model name, so look up `:model`, not `:model_clean`
    @rtransform :size_group = MODEL_SIZES[:model]
    @aside local size_order = ["<4", "4-9", "10-29", "30-69", ">70"]
    # Remember the score-descending order so the x-axis keeps it
    @aside local order_ = _.model_clean
    data(_) *
    mapping(:model_clean => sorter(order_) => "Model",
        :score => "Avg. Score (Max 100 pts)",
        color = :size_group => sorter(size_order) => "Parameter Size (Bn)") *
    visual(BarPlot; bar_labels = :y, label_offset = 0, label_rotation = 1)
    draw(;
        figure = (; size = (900, 600)),
        legend = (; position = :bottom),
        axis = (;
            xautolimitmargin = (0.1, 0.05),
            limits = (nothing, nothing, 0, 100),
            # Makie expects the rotation in radians: π / 4 is the intended 45°
            # (the previous value `45` was interpreted as 45 rad)
            xticklabelrotation = π / 4,
            title = "Open-Source LLM Model Performance"))
end
SAVE_PLOTS && save("assets/model-comparison-local.png", fig)
fig

Table:

# Summary table: one row per model with timing and score statistics.
output = @chain df begin
    @by [:model] begin
        :elapsed = mean(:elapsed_seconds)
        :elapsed_median = median(:elapsed_seconds)
        :score = mean(:score)
        :score_median = median(:score)
        :score_std_deviation = std(:score)
        # Number of samples scoring exactly 0 and exactly 100, respectively
        :count_zero_score = count(iszero, :score)
        :count_full_score = count(==(100), :score)
    end
    # Round every numeric column to one decimal
    transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
    @orderby -:score
    @rtransform :model = model_clean(:model)
    # Human-readable headers, e.g. `score_std_deviation` -> "Score Std Deviation"
    rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
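| Model | Elapsed | Elapsed Median | Score | Score Median | Score Std Deviation | Count Zero Score | Count Full Score |
|---|---|---|---|---|---|---|---|
| Mixtral-8x22B-Instruct-v0.1(Together.ai) | 14.1 | 11.0 | 77.6 | 90.0 | 25.8 | 5.0 | 151.0 |
| Llama-3-70b-chat-hf(Together.ai) | 4.3 | 4.1 | 76.8 | 88.3 | 25.2 | 0.0 | 160.0 |
| Llama-3-8b-chat-hf(Together.ai) | 1.5 | 1.4 | 67.7 | 66.7 | 26.4 | 5.0 | 70.0 |
| WizardLM-2-8x22B(Together.ai) | 34.7 | 31.0 | 62.7 | 60.0 | 33.8 | 33.0 | 118.0 |
| phind-codellama:34b-v2 | 37.1 | 36.4 | 61.8 | 62.5 | 33.5 | 36.0 | 58.0 |
| magicoder:7b-s-cl-q6_K | 15.6 | 15.8 | 59.9 | 60.0 | 29.9 | 18.0 | 35.0 |
| codellama:13b-instruct-q4_K_M | 3.2 | 3.0 | 56.4 | 54.6 | 33.0 | 56.0 | 61.0 |
| magicoder | 12.8 | 10.7 | 53.7 | 50.0 | 33.2 | 49.0 | 52.0 |
| nous-hermes2:34b-yi-q4_K_M | 56.8 | 52.8 | 50.7 | 50.0 | 34.7 | 78.0 | 56.0 |
| dbrx-instruct(Fireworks.ai) | 3.7 | 3.6 | 50.0 | 50.0 | 41.2 | 121.0 | 75.0 |
| codellama:13b-instruct | 18.1 | 16.7 | 50.0 | 50.0 | 34.4 | 65.0 | 44.0 |
| openchat:7b-v3.5-1210-q4_K_M | 14.4 | 13.7 | 49.4 | 50.0 | 30.3 | 48.0 | 23.0 |
| openhermes2.5-mistral | 12.9 | 12.2 | 48.9 | 50.0 | 31.3 | 55.0 | 27.0 |
| starling-lm:latest | 13.7 | 12.5 | 48.4 | 50.0 | 30.2 | 58.0 | 26.0 |
| codellama:7b-instruct-q4_K_M | 2.1 | 2.0 | 47.8 | 50.0 | 35.3 | 95.0 | 38.0 |
| qwen-72b-chat(Fireworks.ai) | 3.2 | 3.8 | 45.9 | 50.0 | 38.8 | 117.0 | 63.0 |
| yi:34b-chat | 43.9 | 41.3 | 45.6 | 50.0 | 30.5 | 45.0 | 34.0 |
| mistral:7b-instruct-v0.2-q6_K | 21.7 | 20.9 | 45.4 | 50.0 | 31.3 | 44.0 | 23.0 |
| mistral:7b-instruct-v0.2-q4_0 | 12.4 | 12.3 | 44.3 | 50.0 | 30.6 | 75.0 | 32.0 |
| mistral:7b-instruct-v0.2-q4_K_M | 15.6 | 15.1 | 42.6 | 50.0 | 28.6 | 71.0 | 23.0 |
| codellama:34b-instruct-q4_K_M | 7.5 | 6.8 | 39.7 | 50.0 | 36.1 | 127.0 | 35.0 |
| codellama:70b-instruct-q4_K_M | 16.3 | 13.8 | 36.4 | 0.0 | 41.2 | 179.0 | 58.0 |
| solar:10.7b-instruct-v1-q4_K_M | 18.8 | 17.7 | 35.2 | 50.0 | 31.1 | 107.0 | 10.0 |
| mistral:7b-instruct-q4_K_M | 13.9 | 13.0 | 34.8 | 50.0 | 26.5 | 80.0 | 0.0 |
| codellama:70b-instruct-q2_K | 11.2 | 9.4 | 29.8 | 0.0 | 37.7 | 198.0 | 29.0 |
| llama2 | 17.1 | 16.3 | 26.5 | 25.0 | 26.5 | 131.0 | 0.0 |
| gemma:7b-instruct-q6_K | 20.9 | 22.1 | 25.9 | 25.0 | 25.2 | 147.0 | 2.0 |
| orca2:13b | 20.1 | 18.3 | 23.1 | 0.0 | 30.6 | 166.0 | 11.0 |
| stablelm-zephyr | 9.9 | 7.7 | 15.4 | 0.0 | 23.5 | 192.0 | 1.0 |
| dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.4 | 14.9 | 0.0 | 22.9 | 188.0 | 0.0 |
| codellama:13b-python | 12.5 | 10.7 | 12.8 | 0.0 | 22.1 | 155.0 | 0.0 |
| phi:2.7b-chat-v2-q6_K | 13.0 | 11.6 | 8.9 | 0.0 | 19.4 | 222.0 | 0.0 |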

Note that our sample size is low, so the rankings could easily change (we have high standard deviations of the estimated means). Take the results only as indicative.

Overview by Prompt Template

Bar chart with all local models and various prompt templates

# Grouped bar chart: average score per model, one bar per prompt template.
fig = @chain df begin
    @by [:model, :prompt_label] begin
        :cost = mean(:cost)
        :elapsed = mean(:elapsed_seconds)
        :score = mean(:score)
        :score_median = median(:score)
        :cnt = $nrow
    end
    @rtransform :model = model_clean(:model)
    # Order the models on the x-axis by their average over the prompt templates
    @aside local average_ = @by(_, :model, :avg=mean(:score)) |>
                            x -> @orderby(x, -:avg).model
    data(_) *
    mapping(:model => sorter(average_) => "Model",
        :score => "Avg. Score (Max 100 pts)",
        color = :prompt_label => "Prompts",
        dodge = :prompt_label) * visual(BarPlot)
    # Makie expects the tick-label rotation in radians: π / 4 is the intended 45°
    # (the previous value `45` was interpreted as 45 rad)
    draw(; figure = (size = (900, 600),),
        axis = (xautolimitmargin = (0.1, 0.05), xticklabelrotation = π / 4,
            title = "Comparison for Local Models"),
        legend = (; position = :bottom))
end
SAVE_PLOTS && save("assets/model-prompt-comparison-local.png", fig)
fig

Table:

# Table of average scores: one row per model, one column per prompt template,
# plus the per-model average, sorted by that average.
output = @chain df begin
    @by [:model, :prompt_label] begin
        :cost = mean(:cost)
        :elapsed = mean(:elapsed_seconds)
        :score = mean(:score)
    end
    # Per-model average of the per-prompt mean scores, rounded to one decimal
    @aside average_ = @by _ :model :AverageScore=mean(:score) |> x -> round(x, digits = 1)
    # Wide format; a model/prompt combination without rows is filled with 0.0
    unstack(:model, :prompt_label, :score; fill = 0.0)
    transform(_, names(_, Number) .=> ByRow(x -> round(x, digits = 1)), renamecols = false)
    leftjoin(average_, on = :model)
    @orderby -:AverageScore
    # NOTE(review): unlike the first table, `:model` is not passed through `model_clean`
    # here, so hosted models keep their provider path prefix - confirm this is intended.
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
| model | InJulia | JuliaExpertAsk | JuliaExpertCoTTask | JuliaRecapCoTTask | JuliaRecapTask | AverageScore |
|---|---|---|---|---|---|---|
| mistralai/Mixtral-8x22B-Instruct-v0.1 | 77.8 | 79.9 | 73.1 | 82.5 | 74.8 | 77.6 |
| meta-llama/Llama-3-70b-chat-hf | 75.5 | 82.9 | 75.9 | 76.7 | 73.2 | 76.8 |
| meta-llama/Llama-3-8b-chat-hf | 65.5 | 71.8 | 68.0 | 65.7 | 67.6 | 67.7 |
| microsoft/WizardLM-2-8x22B | 52.8 | 67.1 | 70.8 | 63.5 | 59.1 | 62.7 |
| phind-codellama:34b-v2 | 59.0 | 68.1 | 60.1 | 59.8 | 62.0 | 61.8 |
| magicoder:7b-s-cl-q6_K | 62.2 | 60.1 | 52.8 | 59.4 | 65.2 | 59.9 |
| codellama:13b-instruct-q4_K_M | 67.6 | 63.4 | 42.8 | 52.5 | 55.8 | 56.4 |
| magicoder | 60.8 | 50.2 | 44.7 | 56.6 | 56.3 | 53.7 |
| nous-hermes2:34b-yi-q4_K_M | 61.8 | 37.4 | 51.4 | 46.1 | 56.6 | 50.7 |
| accounts/fireworks/models/dbrx-instruct | 56.3 | 64.3 | 47.3 | 40.8 | 41.6 | 50.0 |
| codellama:13b-instruct | 53.2 | 51.1 | 44.5 | 48.5 | 53.0 | 50.0 |
| openchat:7b-v3.5-1210-q4_K_M | 50.9 | 51.0 | 43.1 | 49.1 | 52.9 | 49.4 |
| openhermes2.5-mistral | 49.6 | 50.7 | 51.9 | 40.9 | 51.3 | 48.9 |
| starling-lm:latest | 51.1 | 55.5 | 36.8 | 46.1 | 52.6 | 48.4 |
| codellama:7b-instruct-q4_K_M | 57.7 | 33.1 | 32.6 | 55.4 | 60.4 | 47.8 |
| accounts/fireworks/models/qwen-72b-chat | 59.7 | 51.4 | 49.2 | 27.2 | 42.2 | 45.9 |
| yi:34b-chat | 44.5 | 52.7 | 39.2 | 44.0 | 47.6 | 45.6 |
| mistral:7b-instruct-v0.2-q6_K | 43.2 | 39.8 | 47.5 | 48.2 | 48.6 | 45.4 |
| mistral:7b-instruct-v0.2-q4_0 | 47.4 | 40.3 | 42.8 | 43.8 | 47.3 | 44.3 |
| mistral:7b-instruct-v0.2-q4_K_M | 41.8 | 48.4 | 40.5 | 38.5 | 44.0 | 42.6 |
| codellama:34b-instruct-q4_K_M | 50.1 | 53.0 | 34.8 | 29.0 | 31.6 | 39.7 |
| codellama:70b-instruct-q4_K_M | 43.9 | 29.9 | 30.8 | 35.2 | 42.4 | 36.4 |
| solar:10.7b-instruct-v1-q4_K_M | 43.9 | 36.9 | 19.7 | 36.7 | 38.9 | 35.2 |
| mistral:7b-instruct-q4_K_M | 37.3 | 37.3 | 35.0 | 31.5 | 33.0 | 34.8 |
| codellama:70b-instruct-q2_K | 34.6 | 21.3 | 25.5 | 30.0 | 37.4 | 29.8 |
| llama2 | 26.4 | 31.4 | 27.2 | 25.0 | 22.4 | 26.5 |
| gemma:7b-instruct-q6_K | 22.6 | 25.3 | 19.9 | 34.9 | 26.9 | 25.9 |
| orca2:13b | 31.3 | 16.5 | 24.2 | 21.5 | 21.9 | 23.1 |
| stablelm-zephyr | 13.3 | 15.6 | 12.2 | 16.5 | 19.2 | 15.4 |
| dolphin-phi:2.7b-v2.6-q6_K | 15.6 | 16.1 | 12.2 | 14.1 | 16.3 | 14.9 |
| codellama:13b-python | 11.0 | 13.3 | 13.3 | 12.4 | 13.9 | 12.8 |
| phi:2.7b-chat-v2-q6_K | 6.0 | 8.1 | 9.9 | 9.9 | 10.8 | 8.9 |

Other Considerations

Comparison of Time-to-generate vs Average Score. All HOSTED models are removed here (that's why you don't see some models that are in other plots).

# Scatter of average elapsed time vs average score, one point per model and prompt
# template; rows whose `:device` contains "HOSTED" are excluded.
fig = @chain df begin
    @rsubset !occursin("HOSTED", :device)
    # x-axis limits: 1%-99% quantiles of the elapsed times of the rows that are actually
    # plotted (`_` is the filtered table; the global `df` still contains the HOSTED rows)
    @aside local xlims = quantile(_.elapsed_seconds, [0.01, 0.99])
    @by [:model, :prompt_label] begin
        :elapsed = mean(:elapsed_seconds)
        :elapsed_median = median(:elapsed_seconds)
        :score = mean(:score)
        :score_median = median(:score)
        :cnt = $nrow
    end
    data(_) * mapping(:elapsed => "Avg. Elapsed Time (s)",
        :score => "Avg. Score (Max 100 pts)",
        color = :model => "Model")
    # Makie expects the tick-label rotation in radians: π / 4 is the intended 45°
    # (the previous value `45` was interpreted as 45 rad)
    draw(; figure = (size = (800, 900),),
        axis = (xautolimitmargin = (0.1, 0.05), xticklabelrotation = π / 4,
            title = "Elapsed Time vs Score for Local Models",
            limits = (xlims..., nothing, nothing)),
        palettes = (; color = Makie.ColorSchemes.tab20.colors))
end
SAVE_PLOTS && save("assets/elapsed-vs-score-scatter-local.png", fig)
fig

Table:

  • Point per second is the average score divided by the average elapsed time
# Table of score per second of generation time, one row per model and prompt template;
# rows whose `:device` contains "HOSTED" are excluded.
output = @chain df begin
    @rsubset !occursin("HOSTED", :device)
    @by [:model, :prompt_label] begin
        :elapsed = mean(:elapsed_seconds)
        :elapsed_median = median(:elapsed_seconds)
        :score_avg = mean(:score)
        :score_median = median(:score)
        :cnt = $nrow
    end
    # Average score divided by average elapsed time, computed before any rounding
    @rtransform :point_per_second = :score_avg / :elapsed
    @orderby -:point_per_second
    # Round every column except the two label columns to one decimal
    transform(_,
        names(_, Not(:model, :prompt_label)) .=> ByRow(x -> round(x, digits = 1)),
        renamecols = false)
    # Human-readable headers, e.g. `point_per_second` -> "Point Per Second"
    rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
| Model | Prompt Label | Elapsed | Elapsed Median | Score Avg | Score Median | Cnt | Point Per Second |
|---|---|---|---|---|---|---|---|
| codellama:13b-instruct-q4_K_M | JuliaExpertAsk | 2.0 | 1.9 | 63.4 | 75.0 | 70.0 | 32.1 |
| codellama:7b-instruct-q4_K_M | InJulia | 2.0 | 2.0 | 57.7 | 55.0 | 70.0 | 29.1 |
| codellama:7b-instruct-q4_K_M | JuliaExpertAsk | 1.2 | 0.9 | 33.1 | 0.0 | 70.0 | 26.5 |
| codellama:7b-instruct-q4_K_M | JuliaRecapTask | 2.6 | 2.5 | 60.4 | 60.0 | 70.0 | 23.3 |
| codellama:7b-instruct-q4_K_M | JuliaExpertCoTTask | 1.6 | 1.4 | 32.6 | 0.0 | 70.0 | 20.6 |
| codellama:13b-instruct-q4_K_M | InJulia | 3.6 | 3.4 | 67.6 | 61.2 | 70.0 | 18.7 |
| codellama:7b-instruct-q4_K_M | JuliaRecapCoTTask | 3.0 | 2.7 | 55.4 | 50.0 | 70.0 | 18.3 |
| codellama:13b-instruct-q4_K_M | JuliaExpertCoTTask | 2.7 | 2.3 | 42.8 | 50.0 | 70.0 | 15.9 |
| codellama:13b-instruct-q4_K_M | JuliaRecapTask | 3.9 | 3.6 | 55.8 | 50.0 | 70.0 | 14.2 |
| codellama:13b-instruct-q4_K_M | JuliaRecapCoTTask | 3.9 | 4.1 | 52.5 | 50.0 | 70.0 | 13.5 |
| codellama:34b-instruct-q4_K_M | JuliaExpertAsk | 6.3 | 5.8 | 53.0 | 50.0 | 70.0 | 8.4 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaExpertAsk | 5.8 | 5.6 | 40.3 | 50.0 | 70.0 | 6.9 |
| openchat:7b-v3.5-1210-q4_K_M | JuliaExpertAsk | 7.8 | 6.9 | 51.0 | 50.0 | 55.0 | 6.5 |
| codellama:34b-instruct-q4_K_M | InJulia | 8.1 | 7.9 | 50.1 | 50.0 | 70.0 | 6.2 |
| mistral:7b-instruct-v0.2-q4_K_M | JuliaExpertAsk | 8.1 | 6.9 | 48.4 | 50.0 | 70.0 | 6.0 |
| openhermes2.5-mistral | JuliaExpertAsk | 8.7 | 8.9 | 50.7 | 52.5 | 58.0 | 5.8 |
| starling-lm:latest | JuliaExpertAsk | 9.9 | 9.8 | 55.5 | 50.0 | 58.0 | 5.6 |
| magicoder | InJulia | 11.0 | 9.6 | 60.8 | 60.0 | 57.0 | 5.6 |
| codellama:34b-instruct-q4_K_M | JuliaExpertCoTTask | 6.4 | 6.4 | 34.8 | 25.0 | 70.0 | 5.4 |
| magicoder | JuliaExpertAsk | 9.8 | 8.7 | 50.2 | 50.0 | 58.0 | 5.1 |
| codellama:13b-instruct | JuliaExpertAsk | 10.4 | 8.5 | 51.1 | 50.0 | 58.0 | 4.9 |
| mistral:7b-instruct-q4_K_M | JuliaExpertAsk | 7.7 | 7.4 | 37.3 | 50.0 | 57.0 | 4.9 |
| openhermes2.5-mistral | InJulia | 10.8 | 9.8 | 49.6 | 50.0 | 58.0 | 4.6 |
| starling-lm:latest | InJulia | 11.1 | 11.1 | 51.1 | 50.0 | 58.0 | 4.6 |
| openchat:7b-v3.5-1210-q4_K_M | InJulia | 11.9 | 11.7 | 50.9 | 50.0 | 55.0 | 4.3 |
| magicoder:7b-s-cl-q6_K | InJulia | 14.6 | 15.3 | 62.2 | 55.0 | 42.0 | 4.3 |
| magicoder | JuliaRecapCoTTask | 13.4 | 12.0 | 56.6 | 50.0 | 57.0 | 4.2 |
| magicoder | JuliaRecapTask | 13.4 | 10.9 | 56.3 | 50.0 | 57.0 | 4.2 |
| codellama:34b-instruct-q4_K_M | JuliaRecapTask | 7.6 | 6.2 | 31.6 | 0.0 | 70.0 | 4.2 |
| magicoder:7b-s-cl-q6_K | JuliaExpertAsk | 14.6 | 14.7 | 60.1 | 58.1 | 42.0 | 4.1 |
| magicoder:7b-s-cl-q6_K | JuliaRecapTask | 16.1 | 16.5 | 65.2 | 60.0 | 42.0 | 4.1 |
| mistral:7b-instruct-v0.2-q6_K | JuliaExpertAsk | 10.0 | 8.2 | 39.8 | 50.0 | 42.0 | 4.0 |
| mistral:7b-instruct-v0.2-q4_0 | InJulia | 12.1 | 11.3 | 47.4 | 50.0 | 70.0 | 3.9 |
| mistral:7b-instruct-q4_K_M | InJulia | 10.3 | 10.0 | 37.3 | 50.0 | 57.0 | 3.6 |
| magicoder:7b-s-cl-q6_K | JuliaRecapCoTTask | 16.9 | 16.4 | 59.4 | 55.6 | 42.0 | 3.5 |
| openhermes2.5-mistral | JuliaRecapTask | 15.0 | 14.7 | 51.3 | 50.0 | 58.0 | 3.4 |
| magicoder:7b-s-cl-q6_K | JuliaExpertCoTTask | 16.0 | 16.5 | 52.8 | 58.1 | 42.0 | 3.3 |
| starling-lm:latest | JuliaRecapTask | 16.0 | 13.6 | 52.6 | 50.0 | 58.0 | 3.3 |
| codellama:34b-instruct-q4_K_M | JuliaRecapCoTTask | 8.8 | 7.4 | 29.0 | 12.5 | 70.0 | 3.3 |
| codellama:13b-instruct | InJulia | 16.4 | 14.7 | 53.2 | 50.0 | 58.0 | 3.2 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaExpertCoTTask | 13.2 | 12.9 | 42.8 | 50.0 | 70.0 | 3.2 |
| codellama:70b-instruct-q2_K | JuliaRecapTask | 11.7 | 9.5 | 37.4 | 25.0 | 70.0 | 3.2 |
| llama2 | JuliaExpertAsk | 9.8 | 9.1 | 31.4 | 50.0 | 59.0 | 3.2 |
| openhermes2.5-mistral | JuliaExpertCoTTask | 16.6 | 16.0 | 51.9 | 50.0 | 57.0 | 3.1 |
| starling-lm:latest | JuliaRecapCoTTask | 14.8 | 13.2 | 46.1 | 50.0 | 58.0 | 3.1 |
| openhermes2.5-mistral | JuliaRecapCoTTask | 13.3 | 13.5 | 40.9 | 50.0 | 58.0 | 3.1 |
| openchat:7b-v3.5-1210-q4_K_M | JuliaRecapTask | 17.2 | 15.8 | 52.9 | 50.0 | 55.0 | 3.1 |
| mistral:7b-instruct-v0.2-q4_K_M | InJulia | 14.1 | 13.9 | 41.8 | 50.0 | 70.0 | 3.0 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaRecapCoTTask | 14.8 | 14.2 | 43.8 | 50.0 | 70.0 | 3.0 |
| mistral:7b-instruct-v0.2-q4_0 | JuliaRecapTask | 16.2 | 15.4 | 47.3 | 50.0 | 70.0 | 2.9 |
| openchat:7b-v3.5-1210-q4_K_M | JuliaRecapCoTTask | 16.9 | 15.7 | 49.1 | 50.0 | 55.0 | 2.9 |
| solar:10.7b-instruct-v1-q4_K_M | JuliaExpertAsk | 13.0 | 12.5 | 36.9 | 50.0 | 57.0 | 2.8 |
| codellama:70b-instruct-q2_K | JuliaExpertCoTTask | 9.1 | 8.4 | 25.5 | 0.0 | 70.0 | 2.8 |
| magicoder | JuliaExpertCoTTask | 16.2 | 15.1 | 44.7 | 37.5 | 58.0 | 2.8 |
| orca2:13b | InJulia | 11.4 | 10.5 | 31.3 | 25.0 | 57.0 | 2.7 |
| codellama:70b-instruct-q4_K_M | InJulia | 16.5 | 14.7 | 43.9 | 50.0 | 70.0 | 2.7 |
| codellama:70b-instruct-q2_K | InJulia | 13.0 | 10.9 | 34.6 | 25.0 | 70.0 | 2.7 |
| solar:10.7b-instruct-v1-q4_K_M | InJulia | 17.4 | 15.7 | 43.9 | 50.0 | 57.0 | 2.5 |
| stablelm-zephyr | JuliaExpertAsk | 6.3 | 6.6 | 15.6 | 0.0 | 57.0 | 2.5 |
| mistral:7b-instruct-v0.2-q4_K_M | JuliaExpertCoTTask | 16.4 | 15.9 | 40.5 | 50.0 | 70.0 | 2.5 |
| codellama:70b-instruct-q2_K | JuliaRecapCoTTask | 12.3 | 9.7 | 30.0 | 0.0 | 70.0 | 2.4 |
| codellama:13b-instruct | JuliaRecapTask | 21.9 | 20.6 | 53.0 | 50.0 | 58.0 | 2.4 |
| gemma:7b-instruct-q6_K | JuliaExpertAsk | 10.5 | 6.2 | 25.3 | 25.0 | 70.0 | 2.4 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertAsk | 6.8 | 6.3 | 16.1 | 0.0 | 56.0 | 2.4 |
| openchat:7b-v3.5-1210-q4_K_M | JuliaExpertCoTTask | 18.2 | 17.9 | 43.1 | 50.0 | 55.0 | 2.4 |
| codellama:70b-instruct-q4_K_M | JuliaRecapTask | 17.9 | 14.1 | 42.4 | 37.5 | 70.0 | 2.4 |
| phind-codellama:34b-v2 | JuliaExpertAsk | 29.5 | 27.7 | 68.1 | 66.7 | 57.0 | 2.3 |
| codellama:13b-instruct | JuliaRecapCoTTask | 21.7 | 20.8 | 48.5 | 50.0 | 58.0 | 2.2 |
| codellama:13b-instruct | JuliaExpertCoTTask | 20.0 | 19.3 | 44.5 | 50.0 | 58.0 | 2.2 |
| mistral:7b-instruct-v0.2-q6_K | InJulia | 19.4 | 17.1 | 43.2 | 50.0 | 42.0 | 2.2 |
| starling-lm:latest | JuliaExpertCoTTask | 16.6 | 16.2 | 36.8 | 50.0 | 58.0 | 2.2 |
| codellama:70b-instruct-q2_K | JuliaExpertAsk | 9.8 | 8.8 | 21.3 | 0.0 | 70.0 | 2.2 |
| mistral:7b-instruct-q4_K_M | JuliaExpertCoTTask | 16.3 | 15.8 | 35.0 | 25.0 | 57.0 | 2.1 |
| mistral:7b-instruct-v0.2-q4_K_M | JuliaRecapTask | 20.5 | 18.7 | 44.0 | 50.0 | 70.0 | 2.1 |
| codellama:70b-instruct-q4_K_M | JuliaRecapCoTTask | 16.7 | 12.8 | 35.2 | 0.0 | 70.0 | 2.1 |
| codellama:70b-instruct-q4_K_M | JuliaExpertCoTTask | 14.8 | 13.3 | 30.8 | 0.0 | 70.0 | 2.1 |
| mistral:7b-instruct-v0.2-q4_K_M | JuliaRecapCoTTask | 18.9 | 17.9 | 38.5 | 50.0 | 70.0 | 2.0 |
| yi:34b-chat | JuliaExpertAsk | 26.1 | 22.8 | 52.7 | 52.5 | 58.0 | 2.0 |
| mistral:7b-instruct-v0.2-q6_K | JuliaExpertCoTTask | 23.8 | 25.1 | 47.5 | 50.0 | 42.0 | 2.0 |
| mistral:7b-instruct-q4_K_M | JuliaRecapTask | 16.7 | 15.9 | 33.0 | 25.0 | 55.0 | 2.0 |
| codellama:70b-instruct-q4_K_M | JuliaExpertAsk | 15.7 | 13.3 | 29.9 | 0.0 | 70.0 | 1.9 |
| solar:10.7b-instruct-v1-q4_K_M | JuliaRecapCoTTask | 19.7 | 19.1 | 36.7 | 50.0 | 57.0 | 1.9 |
| solar:10.7b-instruct-v1-q4_K_M | JuliaRecapTask | 21.3 | 21.0 | 38.9 | 50.0 | 57.0 | 1.8 |
| mistral:7b-instruct-v0.2-q6_K | JuliaRecapCoTTask | 26.9 | 24.7 | 48.2 | 50.0 | 42.0 | 1.8 |
| phind-codellama:34b-v2 | InJulia | 33.2 | 34.3 | 59.0 | 61.2 | 57.0 | 1.8 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapTask | 9.5 | 9.3 | 16.3 | 0.0 | 56.0 | 1.7 |
| llama2 | InJulia | 15.3 | 13.9 | 26.4 | 25.0 | 59.0 | 1.7 |
| mistral:7b-instruct-v0.2-q6_K | JuliaRecapTask | 28.3 | 27.2 | 48.6 | 50.0 | 42.0 | 1.7 |
| mistral:7b-instruct-q4_K_M | JuliaRecapCoTTask | 18.7 | 17.6 | 31.5 | 50.0 | 55.0 | 1.7 |
| phind-codellama:34b-v2 | JuliaRecapCoTTask | 37.1 | 36.9 | 59.8 | 61.2 | 57.0 | 1.6 |
| stablelm-zephyr | JuliaRecapTask | 12.1 | 8.3 | 19.2 | 0.0 | 57.0 | 1.6 |
| stablelm-zephyr | InJulia | 8.5 | 6.6 | 13.3 | 0.0 | 57.0 | 1.6 |
| phind-codellama:34b-v2 | JuliaRecapTask | 41.1 | 40.6 | 62.0 | 61.2 | 57.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaExpertCoTTask | 8.1 | 8.0 | 12.2 | 0.0 | 56.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | JuliaRecapCoTTask | 9.4 | 8.9 | 14.1 | 0.0 | 56.0 | 1.5 |
| orca2:13b | JuliaExpertAsk | 11.0 | 9.2 | 16.5 | 0.0 | 57.0 | 1.5 |
| dolphin-phi:2.7b-v2.6-q6_K | InJulia | 10.6 | 9.4 | 15.6 | 0.0 | 56.0 | 1.5 |
| stablelm-zephyr | JuliaRecapCoTTask | 11.4 | 8.8 | 16.5 | 0.0 | 57.0 | 1.4 |
| llama2 | JuliaExpertCoTTask | 18.9 | 17.3 | 27.2 | 25.0 | 59.0 | 1.4 |
| gemma:7b-instruct-q6_K | JuliaRecapCoTTask | 25.7 | 25.0 | 34.9 | 50.0 | 70.0 | 1.4 |
| phind-codellama:34b-v2 | JuliaExpertCoTTask | 44.6 | 46.5 | 60.1 | 66.7 | 57.0 | 1.3 |
| codellama:13b-python | JuliaRecapCoTTask | 9.5 | 6.4 | 12.4 | 0.0 | 42.0 | 1.3 |
| llama2 | JuliaRecapCoTTask | 19.3 | 19.2 | 25.0 | 25.0 | 59.0 | 1.3 |
| codellama:13b-python | JuliaExpertAsk | 10.4 | 7.9 | 13.3 | 0.0 | 44.0 | 1.3 |
| nous-hermes2:34b-yi-q4_K_M | InJulia | 52.0 | 45.5 | 61.8 | 60.0 | 67.0 | 1.2 |
| gemma:7b-instruct-q6_K | InJulia | 19.7 | 20.3 | 22.6 | 25.0 | 70.0 | 1.1 |
| gemma:7b-instruct-q6_K | JuliaRecapTask | 24.8 | 23.6 | 26.9 | 25.0 | 70.0 | 1.1 |
| phi:2.7b-chat-v2-q6_K | JuliaExpertCoTTask | 9.3 | 5.7 | 9.9 | 0.0 | 55.0 | 1.1 |
| stablelm-zephyr | JuliaExpertCoTTask | 11.4 | 9.6 | 12.2 | 0.0 | 57.0 | 1.1 |
| nous-hermes2:34b-yi-q4_K_M | JuliaExpertAsk | 35.9 | 32.9 | 37.4 | 50.0 | 67.0 | 1.0 |
| codellama:13b-python | JuliaExpertCoTTask | 12.8 | 12.9 | 13.3 | 0.0 | 43.0 | 1.0 |
| llama2 | JuliaRecapTask | 22.1 | 22.2 | 22.4 | 0.0 | 59.0 | 1.0 |
| orca2:13b | JuliaExpertCoTTask | 23.9 | 23.1 | 24.2 | 0.0 | 57.0 | 1.0 |
| yi:34b-chat | JuliaRecapTask | 50.8 | 48.8 | 47.6 | 50.0 | 58.0 | 0.9 |
| yi:34b-chat | JuliaExpertCoTTask | 42.1 | 40.6 | 39.2 | 25.0 | 58.0 | 0.9 |
| yi:34b-chat | JuliaRecapCoTTask | 49.5 | 45.6 | 44.0 | 50.0 | 57.0 | 0.9 |
| solar:10.7b-instruct-v1-q4_K_M | JuliaExpertCoTTask | 22.5 | 22.4 | 19.7 | 0.0 | 58.0 | 0.9 |
| yi:34b-chat | InJulia | 51.1 | 48.6 | 44.5 | 50.0 | 58.0 | 0.9 |
| codellama:13b-python | JuliaRecapTask | 16.3 | 10.9 | 13.9 | 0.0 | 43.0 | 0.9 |
| nous-hermes2:34b-yi-q4_K_M | JuliaRecapTask | 67.6 | 61.9 | 56.6 | 50.0 | 65.0 | 0.8 |
| gemma:7b-instruct-q6_K | JuliaExpertCoTTask | 23.8 | 23.3 | 19.9 | 25.0 | 70.0 | 0.8 |
| phi:2.7b-chat-v2-q6_K | JuliaRecapCoTTask | 12.1 | 10.8 | 9.9 | 0.0 | 55.0 | 0.8 |
| codellama:13b-python | InJulia | 13.7 | 12.5 | 11.0 | 0.0 | 44.0 | 0.8 |
| orca2:13b | JuliaRecapCoTTask | 26.7 | 25.4 | 21.5 | 0.0 | 57.0 | 0.8 |
| phi:2.7b-chat-v2-q6_K | JuliaRecapTask | 13.6 | 12.9 | 10.8 | 0.0 | 55.0 | 0.8 |
| orca2:13b | JuliaRecapTask | 27.5 | 24.8 | 21.9 | 0.0 | 57.0 | 0.8 |
| nous-hermes2:34b-yi-q4_K_M | JuliaRecapCoTTask | 58.5 | 60.0 | 46.1 | 50.0 | 65.0 | 0.8 |
| nous-hermes2:34b-yi-q4_K_M | JuliaExpertCoTTask | 70.4 | 65.5 | 51.4 | 55.0 | 67.0 | 0.7 |
| phi:2.7b-chat-v2-q6_K | JuliaExpertAsk | 14.6 | 13.4 | 8.1 | 0.0 | 55.0 | 0.6 |
| phi:2.7b-chat-v2-q6_K | InJulia | 15.7 | 17.8 | 6.0 | 0.0 | 55.0 | 0.4 |

This page was generated using Literate.jl.