# Comparison of Paid LLM APIs vs Local Models

While several locally-hosted models are very impressive, there is still a sizeable performance gap between them and the paid APIs.

This gap would be smaller if we could run any of the 70B models locally; unfortunately, that's not the case for me.

Reminder: the scores below are on a scale of 0-100, where 100 is the best possible score and 0 means the generated code was not even parseable.
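
For intuition, each evaluation aggregates a handful of automated checks (does the code parse, does it run, does it pass the unit tests and the examples) into a single number; the exact weighting lives in JuliaLLMLeaderboard's evaluation code. The sketch below only illustrates the scale with assumed equal 25-point weights and is not the package's actual scoring function.

```julia
# Illustration of the 0-100 scale only; assumed equal 25-point weights,
# NOT the actual JuliaLLMLeaderboard scoring implementation.
illustrative_score(; parsed, executed, unit_tests_share, examples_share) =
    25 * parsed + 25 * executed + 25 * unit_tests_share + 25 * examples_share

illustrative_score(parsed = true, executed = true,
    unit_tests_share = 0.8, examples_share = 1.0)  # 95.0
```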

```julia
# Imports
using JuliaLLMLeaderboard
using CairoMakie, AlgebraOfGraphics
using MarkdownTables, DataFramesMeta
using Statistics: mean, median, quantile;
unscrub_string(s::AbstractString) = split(s, "_") .|> titlecase |> x -> join(x, " ");

# ! Configuration
SAVE_PLOTS = false
DIR_RESULTS = joinpath(pkgdir(JuliaLLMLeaderboard), "code_generation")
# Models accessed via paid APIs; used below to flag :is_paid
PAID_MODELS_DEFAULT = [
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
    "gpt-4-turbo-2024-04-09",
    "gpt-4o-2024-05-13",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-2024-08-06",
    "mistral-tiny",
    "mistral-small",
    "mistral-medium",
    "mistral-large",
    "mistral-small-2402",
    "mistral-medium-2312",
    "mistral-large-2402",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "claude-3-5-sonnet-20240620",
    "claude-2.1",
    "gemini-1.0-pro-latest",
    "deepseek-chat",
    "deepseek-coder",
    "codestral-2405",
    "mistral-large-2407"
];
# Prompt templates included in the comparison
PROMPTS = [
    "JuliaExpertCoTTask",
    "JuliaExpertAsk",
    "InJulia",
    "JuliaRecapTask",
    "JuliaRecapCoTTask"
];
```
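
As a quick sanity check, the `unscrub_string` helper defined above simply turns snake_case column names into title-cased table headers:

```julia
unscrub_string("count_zero_score")  # "Count Zero Score"
```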

## Load Latest Results

Use only the 5 most recent evaluations available for each definition/model/prompt combination.

```julia
df = @chain begin
    load_evals(DIR_RESULTS; max_history = 5)
    @rsubset :prompt_label in PROMPTS
    # remove qwen models as they are not correct!
    @rsubset !occursin("qwen", :model)
end;
```
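
Before aggregating, it can be useful to confirm how many evaluation rows survived the filters. A minimal sketch (it only assumes the `:model` and `:score` columns used in the queries below):

```julia
# Rows kept per model after filtering; purely a sanity check.
@chain df begin
    @by [:model] :n = length(:score)
    @orderby -:n
end
```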

## Comparison by Model

Highest average score by model:

```julia
fig = @chain df begin
    @by [:model] begin
        :cost = mean(:cost)
        :elapsed = mean(:elapsed_seconds)
        :score = mean(:score)
    end
    @orderby -:score
    @rtransform :is_paid = :model in PAID_MODELS_DEFAULT
    @rsubset !endswith(:model, "--optim")
    @aside local order_ = _.model
    data(_) *
    mapping(:model => sorter(order_) => "Model",
        :score => "Avg. Score (Max 100 pts)",
        color = :is_paid => "Paid API or Locally-hosted") *
    visual(BarPlot; bar_labels = :y, label_offset = 0, label_formatter = x -> round(Int, x))
    draw(;
        figure = (; size = (900, 600)),
        legend = (; position = :bottom),
        axis = (;
            limits = (nothing, nothing, 0, 100),
            xticklabelrotation = 45,
            title = "LLM Model Performance [PRELIMINARY]"))
end
fig
```
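
The `SAVE_PLOTS` flag from the configuration block is not used in this excerpt; if you want to persist the figure, a one-liner like the following works with CairoMakie (the file name here is just an example):

```julia
# Hypothetical output path; any .png/.svg/.pdf path works with CairoMakie's save.
SAVE_PLOTS && save("model-comparison-paid-vs-local.png", fig)
```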

Table:

```julia
output = @chain df begin
    @by [:model] begin
        :elapsed = mean(:elapsed_seconds)
        :elapsed_median = median(:elapsed_seconds)
        :score = mean(:score)
        :score_median = median(:score)
        :count_zero_score = count(iszero, :score)
        :count_full_score = count(==(100), :score)
    end
    @rtransform :is_paid = :model in PAID_MODELS_DEFAULT
    @rsubset !endswith(:model, "--optim")
    transform(_,
        names(_, Not(:model, :is_paid)) .=> ByRow(x -> round(x, digits = 1)),
        renamecols = false)
    @orderby -:score
    rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
```
| Model | Elapsed | Elapsed Median | Score | Score Median | Count Zero Score | Count Full Score | Is Paid |
|---|---|---|---|---|---|---|---|
| claude-3-5-sonnet-20240620 | 6.3 | 6.3 | 86.0 | 100.0 | 5.0 | 179.0 | true |
| claude-3-opus-20240229 | 20.5 | 20.5 | 83.0 | 90.0 | 1.0 | 161.0 | true |
| claude-3-sonnet-20240229 | 8.7 | 8.7 | 79.0 | 95.0 | 15.0 | 161.0 | true |
| codestral-2405 | 1.9 | 1.9 | 78.0 | 95.0 | 16.0 | 146.0 | true |
| mistralai/Mixtral-8x22B-Instruct-v0.1 | 14.1 | 14.1 | 77.6 | 90.0 | 5.0 | 151.0 | false |
| gpt-4o-2024-08-06 | 4.7 | 4.7 | 77.3 | 90.0 | 12.0 | 155.0 | true |
| meta-llama/Llama-3-70b-chat-hf | 4.3 | 4.3 | 76.8 | 88.3 | 0.0 | 160.0 | false |
| gpt-4-turbo-2024-04-09 | 10.9 | 10.9 | 74.9 | 90.0 | 22.0 | 146.0 | true |
| gpt-4-1106-preview | 22.4 | 22.4 | 74.4 | 90.0 | 19.0 | 142.0 | true |
| claude-3-haiku-20240307 | 4.0 | 4.0 | 74.1 | 84.2 | 4.0 | 125.0 | true |
| mistral-large-2407 | 11.3 | 11.3 | 73.6 | 83.1 | 15.0 | 137.0 | true |
| gpt-4o-mini-2024-07-18 | 5.2 | 5.2 | 73.5 | 86.7 | 19.0 | 137.0 | true |
| gpt-4-0125-preview | 30.2 | 30.2 | 73.1 | 88.8 | 26.0 | 140.0 | true |
| gpt-4o-2024-05-13 | 4.3 | 4.3 | 72.2 | 86.7 | 21.0 | 122.0 | true |
| deepseek-coder | 13.0 | 13.0 | 71.6 | 83.3 | 39.0 | 115.0 | true |
| deepseek-chat | 17.9 | 17.9 | 71.3 | 80.6 | 30.0 | 138.0 | true |
| mistral-large-2402 | 8.6 | 8.6 | 71.1 | 80.0 | 5.0 | 103.0 | true |
| meta-llama/Llama-3-8b-chat-hf | 1.5 | 1.5 | 67.7 | 66.7 | 5.0 | 70.0 | false |
| claude-2.1 | 10.1 | 10.1 | 67.3 | 75.0 | 27.0 | 116.0 | true |
| microsoft/WizardLM-2-8x22B | 34.7 | 34.7 | 62.7 | 60.0 | 33.0 | 118.0 | false |
| gpt-3.5-turbo-0125 | 1.2 | 1.2 | 62.1 | 67.1 | 62.0 | 95.0 | true |
| phind-codellama:34b-v2 | 37.1 | 37.1 | 61.8 | 62.5 | 36.0 | 58.0 | false |
| mistral-medium | 18.1 | 18.1 | 60.8 | 60.0 | 22.0 | 90.0 | true |
| mistral-small-2402 | 5.0 | 5.0 | 60.2 | 55.0 | 15.0 | 84.0 | true |
| mistral-small | 5.9 | 5.9 | 60.1 | 55.0 | 27.0 | 76.0 | true |
| magicoder:7b-s-cl-q6_K | 15.6 | 15.6 | 59.9 | 60.0 | 18.0 | 35.0 | false |
| gpt-3.5-turbo-1106 | 2.1 | 2.1 | 58.4 | 62.5 | 82.0 | 97.0 | true |
| codellama:13b-instruct-q4_K_M | 3.2 | 3.2 | 56.4 | 54.6 | 56.0 | 61.0 | false |
| deepseek-coder:33b-instruct-q4_K_M | 46.7 | 46.7 | 55.0 | 50.0 | 62.0 | 68.0 | false |
| magicoder | 12.8 | 12.8 | 53.7 | 50.0 | 49.0 | 52.0 | false |
| nous-hermes2:34b-yi-q4_K_M | 56.8 | 56.8 | 50.7 | 50.0 | 78.0 | 56.0 | false |
| accounts/fireworks/models/dbrx-instruct | 3.7 | 3.7 | 50.0 | 50.0 | 121.0 | 75.0 | false |
| codellama:13b-instruct | 18.1 | 18.1 | 50.0 | 50.0 | 65.0 | 44.0 | false |
| openchat:7b-v3.5-1210-q4_K_M | 14.4 | 14.4 | 49.4 | 50.0 | 48.0 | 23.0 | false |
| openhermes2.5-mistral | 12.9 | 12.9 | 48.9 | 50.0 | 55.0 | 27.0 | false |
| starling-lm:latest | 13.7 | 13.7 | 48.4 | 50.0 | 58.0 | 26.0 | false |
| codellama:7b-instruct-q4_K_M | 2.1 | 2.1 | 47.8 | 50.0 | 95.0 | 38.0 | false |
| mistral-tiny | 4.6 | 4.6 | 46.9 | 50.0 | 75.0 | 42.0 | true |
| yi:34b-chat | 43.9 | 43.9 | 45.6 | 50.0 | 45.0 | 34.0 | false |
| mistral:7b-instruct-v0.2-q6_K | 21.7 | 21.7 | 45.4 | 50.0 | 44.0 | 23.0 | false |
| mistral:7b-instruct-v0.2-q4_0 | 12.4 | 12.4 | 44.3 | 50.0 | 75.0 | 32.0 | false |
| mistral:7b-instruct-v0.2-q4_K_M | 15.6 | 15.6 | 42.6 | 50.0 | 71.0 | 23.0 | false |
| gpt-3.5-turbo | 3.6 | 3.6 | 42.3 | 50.0 | 132.0 | 54.0 | true |
| codellama:34b-instruct-q4_K_M | 7.5 | 7.5 | 39.7 | 50.0 | 127.0 | 35.0 | false |
| codellama:70b-instruct-q4_K_M | 16.3 | 16.3 | 36.4 | 0.0 | 179.0 | 58.0 | false |
| gemini-1.0-pro-latest | 4.2 | 4.2 | 35.9 | 50.0 | 76.0 | 9.0 | true |
| solar:10.7b-instruct-v1-q4_K_M | 18.8 | 18.8 | 35.2 | 50.0 | 107.0 | 10.0 | false |
| mistral:7b-instruct-q4_K_M | 13.9 | 13.9 | 34.8 | 50.0 | 80.0 | 0.0 | false |
| codellama:70b-instruct-q2_K | 11.2 | 11.2 | 29.8 | 0.0 | 198.0 | 29.0 | false |
| llama2 | 17.1 | 17.1 | 26.5 | 25.0 | 131.0 | 0.0 | false |
| gemma:7b-instruct-q6_K | 20.9 | 20.9 | 25.9 | 25.0 | 147.0 | 2.0 | false |
| orca2:13b | 20.1 | 20.1 | 23.1 | 0.0 | 166.0 | 11.0 | false |
| stablelm-zephyr | 9.9 | 9.9 | 15.4 | 0.0 | 192.0 | 1.0 | false |
| dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.9 | 14.9 | 0.0 | 188.0 | 0.0 | false |
| codellama:13b-python | 12.5 | 12.5 | 12.8 | 0.0 | 155.0 | 0.0 | false |
| phi:2.7b-chat-v2-q6_K | 13.0 | 13.0 | 8.9 | 0.0 | 222.0 | 0.0 | false |

This page was generated using Literate.jl.