26b is not enough

Today I ran gemma4:26b on a rented VPS, because the SaaS LLM industry is unhealthy for privacy.

The process was pretty simple: I chose an RTX 5070 Ti at $0.120/hr and installed Ollama. Then, by forwarding the port via SSH, I was able to use it in opencode:¹

ssh -p $PORT root@$IP -L 11434:localhost:11434

The gemma4:31b model was slow, so I used gemma4:26b because it is lighter. But its usability in opencode is almost nil: even a simple task like using tools to identify the purpose of the repo was beyond it.

This is disappointing, and it makes me think this technology is still unusable and in its early stages. The cost of running it locally is absurdly high, while the LLM SaaS companies sell access cheaply.²

The full specs of the VM:

{
  "id": 32467452,
  "ask_contract_id": 32467452,
  "bundle_id": 1780500843,
  "bundled_results": null,
  "bw_nvlink": 0.0,
  "compute_cap": 1200,
  "cpu_arch": "amd64",
  "cpu_cores": 20,
  "cpu_cores_effective": 10.0,
  "cpu_ghz": 8.5,
  "cpu_name": "Core™ i5-14600KF",
  "cpu_ram": 32066,
  "credit_discount_max": 0.0,
  "cuda_max_good": 12.8,
  "direct_port_count": 498,
  "disk_bw": 7155.0,
  "disk_name": "GIGABYTE GP-ASM2NE6200TTTD",
  "disk_space": 1215.9863182468443,
  "dlperf": 76.18168211918344,
  "dlperf_per_dphtotal": 657.9991737741373,
  "dph_base": 0.11466666666666665,
  "dph_total": 0.11577777777777776,
  "driver_version": "570.133.07",
  "driver_vers": 570133007,
  "duration": 148851979.30196834,
  "end_date": 1924300800.0,
  "external": null,
  "flops_per_dphtotal": 386.3290595009598,
  "geolocation": "Vietnam, VN",
  "geolocode": 1895214017,
  "gpu_arch": "nvidia",
  "gpu_display_active": false,
  "gpu_frac": 0.5,
  "gpu_ids": [
    126820
  ],
  "gpu_lanes": 8,
  "gpu_mem_bw": 734.6,
  "gpu_name": "RTX 5070 Ti",
  "gpu_ram": 16303,
  "gpu_total_ram": 16303,
  "gpu_max_power": 300.0,
  "gpu_max_temp": 26.0,
  "has_avx": 1,
  "host_id": 55116,
  "hosting_type": 0,
  "hostname": null,
  "inet_down": 869.8,
  "inet_down_cost": 0.02666666666666667,
  "inet_up": 877.5,
  "inet_up_cost": 0.02666666666666667,
  "is_bid": false,
  "logo": "/static/logos/vastai_small2.png",
  "machine_id": 34704,
  "min_bid": 0.09333333333333334,
  "mobo_name": "Z690A VALKYRIE",
  "num_gpus": 1,
  "os_version": "22.04",
  "pci_gen": 4.0,
  "pcie_bw": 12.8,
  "public_ipaddr": "171.248.251.59",
  "reliability": 0.9977057,
  "reliability_mult": 0.9775616,
  "rentable": true,
  "rented": false,
  "score": 349.1368678623094,
  "start_date": 1775437087.451604,
  "static_ip": false,
  "storage_cost": 0.09999999999999999,
  "storage_total_cost": 0.0011111111111111111,
  "total_flops": 44.728320000000004,
  "verification": "verified",
  "vericode": 1,
  "vram_costperhour": 0.007202273610173996,
  "webpage": null,
  "vms_enabled": false,
  "expected_reliability": 0.0,
  "is_vm_deverified": true,
  "resource_type": "gpu",
  "cluster_id": null,
  "avail_vol_ask_id": 32467453,
  "avail_vol_dph": 0.0001388888888888889,
  "avail_vol_size": 1517.0,
  "nw_disk_min_bw": null,
  "nw_disk_max_bw": null,
  "nw_disk_avg_bw": null,
  "rn": 1,
  "dph_total_adj": 0.1691111111111111,
  "reliability2": 0.9977057,
  "discount_rate": 0.0,
  "discounted_hourly": 0.0,
  "discounted_dph_total": 0.11577777777777776,
  "search": {
    "gpuCostPerHour": 0.11466666666666665,
    "diskHour": 0.0011111111111111111,
    "totalHour": 0.11577777777777776,
    "discountTotalHour": 0,
    "discountedTotalPerHour": 0.11577777777777776
  },
  "instance": {
    "gpuCostPerHour": 0,
    "diskHour": 0.0011111111111111111,
    "totalHour": 0.0011111111111111111,
    "discountTotalHour": 0,
    "discountedTotalPerHour": 0.0011111111111111111
  },
  "time_remaining": "",
  "time_remaining_isbid": "",
  "internet_up_cost_per_tb": 27.30666666666667,
  "internet_down_cost_per_tb": 27.30666666666667
}

Footnotes:

¹ Config in my dotfiles

² Do you find GPU renting worth it for a LocalLLM? : LocalLLaMA