Comparison of Paid LLM APIs vs Local Models
While several locally-hosted models are very impressive, there is still a sizeable performance gap between them and the paid APIs. The gap would be smaller if we could run any of the 70B-parameter models locally; unfortunately, that's not the case for me.
Reminder: The scores below are on a scale of 0-100, where 100 is the best possible score and 0 means the generated code was not even parseable.
```julia
# Imports
using JuliaLLMLeaderboard
using CairoMakie, AlgebraOfGraphics
using MarkdownTables, DataFramesMeta
using Statistics: mean, median, quantile;
unscrub_string(s::AbstractString) = split(s, "_") .|> titlecase |> x -> join(x, " ");
```
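For example, the helper converts the snake_case column names used further below into readable table headers:

```julia
unscrub_string("count_zero_score")
# "Count Zero Score"
```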
```julia
# ! Configuration
SAVE_PLOTS = false
DIR_RESULTS = joinpath(pkgdir(JuliaLLMLeaderboard), "code_generation")
PAID_MODELS_DEFAULT = [
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
    "gpt-4-turbo-2024-04-09",
    "gpt-4o-2024-05-13",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-2024-08-06",
    "mistral-tiny",
    "mistral-small",
    "mistral-medium",
    "mistral-large",
    "mistral-small-2402",
    "mistral-medium-2312",
    "mistral-large-2402",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "claude-3-5-sonnet-20240620",
    "claude-2.1",
    "gemini-1.0-pro-latest",
    "deepseek-chat",
    "deepseek-coder",
    "codestral-2405",
    "mistral-large-2407"
];
PROMPTS = [
    "JuliaExpertCoTTask",
    "JuliaExpertAsk",
    "InJulia",
    "JuliaRecapTask",
    "JuliaRecapCoTTask"
];
```
Load Latest Results
Use only the 5 most recent evaluations available for each definition/model/prompt combination:
```julia
df = @chain begin
    load_evals(DIR_RESULTS; max_history = 5)
    @rsubset :prompt_label in PROMPTS
    # remove qwen models as they are not correct!
    @rsubset !occursin("qwen", :model)
end;
```
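As a quick sanity check, one could peek at a few rows of the loaded evaluations; the column selection below is purely illustrative (these are the columns aggregated in the next section):

```julia
# Peek at a few of the loaded evaluation rows (illustrative only)
first(df[:, [:model, :prompt_label, :score, :elapsed_seconds]], 5)
```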
Comparison by Model
Highest average score by model:
```julia
fig = @chain df begin
    @by [:model] begin
        :cost = mean(:cost)
        :elapsed = mean(:elapsed_seconds)
        :score = mean(:score)
    end
    @orderby -:score
    @rtransform :is_paid = :model in PAID_MODELS_DEFAULT
    @rsubset !endswith(:model, "--optim")
    @aside local order_ = _.model
    data(_) *
    mapping(:model => sorter(order_) => "Model",
        :score => "Avg. Score (Max 100 pts)",
        color = :is_paid => "Paid API or Locally-hosted") *
    visual(BarPlot; bar_labels = :y, label_offset = 0, label_formatter = x -> round(Int, x))
    draw(;
        figure = (; size = (900, 600)),
        legend = (; position = :bottom),
        axis = (;
            limits = (nothing, nothing, 0, 100),
            xticklabelrotation = 45,
            title = "LLM Model Performance [PRELIMINARY]"))
end
fig
```
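The `SAVE_PLOTS` flag from the configuration section is not used in the snippet above; a minimal sketch of how it could be wired up, with an illustrative filename (not necessarily the one the leaderboard itself uses):

```julia
# Save the figure to disk only when SAVE_PLOTS is enabled (filename is illustrative)
SAVE_PLOTS && save("model-comparison-paid-vs-local.png", fig)
```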
Table (elapsed times are in seconds):
```julia
output = @chain df begin
    @by [:model] begin
        :elapsed = mean(:elapsed_seconds)
        :elapsed_median = median(:elapsed_seconds)
        :score = mean(:score)
        :score_median = median(:score)
        :count_zero_score = count(iszero, :score)
        :count_full_score = count(==(100), :score)
    end
    @rtransform :is_paid = :model in PAID_MODELS_DEFAULT
    @rsubset !endswith(:model, "--optim")
    transform(_,
        names(_, Not(:model, :is_paid)) .=> ByRow(x -> round(x, digits = 1)),
        renamecols = false)
    @orderby -:score
    rename(_, names(_) .|> unscrub_string)
end
# markdown_table(output, String) |> clipboard
markdown_table(output)
```
Model | Elapsed | Elapsed Median | Score | Score Median | Count Zero Score | Count Full Score | Is Paid |
---|---|---|---|---|---|---|---|
claude-3-5-sonnet-20240620 | 6.3 | 6.3 | 86.0 | 100.0 | 5.0 | 179.0 | true |
claude-3-opus-20240229 | 20.5 | 20.5 | 83.0 | 90.0 | 1.0 | 161.0 | true |
claude-3-sonnet-20240229 | 8.7 | 8.7 | 79.0 | 95.0 | 15.0 | 161.0 | true |
codestral-2405 | 1.9 | 1.9 | 78.0 | 95.0 | 16.0 | 146.0 | true |
mistralai/Mixtral-8x22B-Instruct-v0.1 | 14.1 | 14.1 | 77.6 | 90.0 | 5.0 | 151.0 | false |
gpt-4o-2024-08-06 | 4.7 | 4.7 | 77.3 | 90.0 | 12.0 | 155.0 | true |
meta-llama/Llama-3-70b-chat-hf | 4.3 | 4.3 | 76.8 | 88.3 | 0.0 | 160.0 | false |
gpt-4-turbo-2024-04-09 | 10.9 | 10.9 | 74.9 | 90.0 | 22.0 | 146.0 | true |
gpt-4-1106-preview | 22.4 | 22.4 | 74.4 | 90.0 | 19.0 | 142.0 | true |
claude-3-haiku-20240307 | 4.0 | 4.0 | 74.1 | 84.2 | 4.0 | 125.0 | true |
mistral-large-2407 | 11.3 | 11.3 | 73.6 | 83.1 | 15.0 | 137.0 | true |
gpt-4o-mini-2024-07-18 | 5.2 | 5.2 | 73.5 | 86.7 | 19.0 | 137.0 | true |
gpt-4-0125-preview | 30.2 | 30.2 | 73.1 | 88.8 | 26.0 | 140.0 | true |
gpt-4o-2024-05-13 | 4.3 | 4.3 | 72.2 | 86.7 | 21.0 | 122.0 | true |
deepseek-coder | 13.0 | 13.0 | 71.6 | 83.3 | 39.0 | 115.0 | true |
deepseek-chat | 17.9 | 17.9 | 71.3 | 80.6 | 30.0 | 138.0 | true |
mistral-large-2402 | 8.6 | 8.6 | 71.1 | 80.0 | 5.0 | 103.0 | true |
meta-llama/Llama-3-8b-chat-hf | 1.5 | 1.5 | 67.7 | 66.7 | 5.0 | 70.0 | false |
claude-2.1 | 10.1 | 10.1 | 67.3 | 75.0 | 27.0 | 116.0 | true |
microsoft/WizardLM-2-8x22B | 34.7 | 34.7 | 62.7 | 60.0 | 33.0 | 118.0 | false |
gpt-3.5-turbo-0125 | 1.2 | 1.2 | 62.1 | 67.1 | 62.0 | 95.0 | true |
phind-codellama:34b-v2 | 37.1 | 37.1 | 61.8 | 62.5 | 36.0 | 58.0 | false |
mistral-medium | 18.1 | 18.1 | 60.8 | 60.0 | 22.0 | 90.0 | true |
mistral-small-2402 | 5.0 | 5.0 | 60.2 | 55.0 | 15.0 | 84.0 | true |
mistral-small | 5.9 | 5.9 | 60.1 | 55.0 | 27.0 | 76.0 | true |
magicoder:7b-s-cl-q6_K | 15.6 | 15.6 | 59.9 | 60.0 | 18.0 | 35.0 | false |
gpt-3.5-turbo-1106 | 2.1 | 2.1 | 58.4 | 62.5 | 82.0 | 97.0 | true |
codellama:13b-instruct-q4KM | 3.2 | 3.2 | 56.4 | 54.6 | 56.0 | 61.0 | false |
deepseek-coder:33b-instruct-q4KM | 46.7 | 46.7 | 55.0 | 50.0 | 62.0 | 68.0 | false |
magicoder | 12.8 | 12.8 | 53.7 | 50.0 | 49.0 | 52.0 | false |
nous-hermes2:34b-yi-q4KM | 56.8 | 56.8 | 50.7 | 50.0 | 78.0 | 56.0 | false |
accounts/fireworks/models/dbrx-instruct | 3.7 | 3.7 | 50.0 | 50.0 | 121.0 | 75.0 | false |
codellama:13b-instruct | 18.1 | 18.1 | 50.0 | 50.0 | 65.0 | 44.0 | false |
openchat:7b-v3.5-1210-q4KM | 14.4 | 14.4 | 49.4 | 50.0 | 48.0 | 23.0 | false |
openhermes2.5-mistral | 12.9 | 12.9 | 48.9 | 50.0 | 55.0 | 27.0 | false |
starling-lm:latest | 13.7 | 13.7 | 48.4 | 50.0 | 58.0 | 26.0 | false |
codellama:7b-instruct-q4KM | 2.1 | 2.1 | 47.8 | 50.0 | 95.0 | 38.0 | false |
mistral-tiny | 4.6 | 4.6 | 46.9 | 50.0 | 75.0 | 42.0 | true |
yi:34b-chat | 43.9 | 43.9 | 45.6 | 50.0 | 45.0 | 34.0 | false |
mistral:7b-instruct-v0.2-q6_K | 21.7 | 21.7 | 45.4 | 50.0 | 44.0 | 23.0 | false |
mistral:7b-instruct-v0.2-q4_0 | 12.4 | 12.4 | 44.3 | 50.0 | 75.0 | 32.0 | false |
mistral:7b-instruct-v0.2-q4KM | 15.6 | 15.6 | 42.6 | 50.0 | 71.0 | 23.0 | false |
gpt-3.5-turbo | 3.6 | 3.6 | 42.3 | 50.0 | 132.0 | 54.0 | true |
codellama:34b-instruct-q4KM | 7.5 | 7.5 | 39.7 | 50.0 | 127.0 | 35.0 | false |
codellama:70b-instruct-q4KM | 16.3 | 16.3 | 36.4 | 0.0 | 179.0 | 58.0 | false |
gemini-1.0-pro-latest | 4.2 | 4.2 | 35.9 | 50.0 | 76.0 | 9.0 | true |
solar:10.7b-instruct-v1-q4KM | 18.8 | 18.8 | 35.2 | 50.0 | 107.0 | 10.0 | false |
mistral:7b-instruct-q4KM | 13.9 | 13.9 | 34.8 | 50.0 | 80.0 | 0.0 | false |
codellama:70b-instruct-q2_K | 11.2 | 11.2 | 29.8 | 0.0 | 198.0 | 29.0 | false |
llama2 | 17.1 | 17.1 | 26.5 | 25.0 | 131.0 | 0.0 | false |
gemma:7b-instruct-q6_K | 20.9 | 20.9 | 25.9 | 25.0 | 147.0 | 2.0 | false |
orca2:13b | 20.1 | 20.1 | 23.1 | 0.0 | 166.0 | 11.0 | false |
stablelm-zephyr | 9.9 | 9.9 | 15.4 | 0.0 | 192.0 | 1.0 | false |
dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.9 | 14.9 | 0.0 | 188.0 | 0.0 | false |
codellama:13b-python | 12.5 | 12.5 | 12.8 | 0.0 | 155.0 | 0.0 | false |
phi:2.7b-chat-v2-q6_K | 13.0 | 13.0 | 8.9 | 0.0 | 222.0 | 0.0 | false |
This page was generated using Literate.jl.