Merge pull request #1869 from Gustav-Simonsson/gpu_miner

all: Add GPU mining, disabled by default
9 years ago · d5327ddc5f
parent b747754009 ec6a548ee3
commit d5327ddc5f
42 changed files with 6764 additions and 42 deletions
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@ -16,8 +16,8 @@
 		},
 		{
 			"ImportPath": "github.com/ethereum/ethash",
-			"Comment": "v23.1-234-g062e40a",
-			"Rev": "062e40a1a1671f5a5102862b56e4c56f68a732f5"
+			"Comment": "v23.1-235-gb39e007",
+			"Rev": "b39e007d393ab5945b4c0748a7415b7e31c5db04"
 		},
 		{
 			"ImportPath": "github.com/fatih/color",
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl.go
@ -0,0 +1,26 @@
+/*
+Package cl provides a binding to the OpenCL api. It's mostly a low-level
+wrapper that avoids adding functionality while still making the interface
+a little more friendly and easy to use.
+
+Resource life-cycle management:
+
+For any CL object that gets created (buffer, queue, kernel, etc..) you should
+call object.Release() when finished with it to free the CL resources. This
+explicitly calls the related clReleaseXXX function for the type. However,
+as a fallback there is a finalizer set for every resource item that takes
+care of it (eventually) if Release isn't called. In this way you can have
+better control over the life cycle of resources while having a fall back
+to avoid leaks. This is similar to how file handles and such are handled
+in the Go standard packages.
+*/
+package cl
+
+// #include "headers/1.2/opencl.h"
+// #cgo CFLAGS: -Iheaders/1.2
+// #cgo darwin LDFLAGS: -framework OpenCL
+// #cgo linux LDFLAGS: -lOpenCL
+import "C"
+import "errors"
+
+// ErrUnsupported is the package-level sentinel error carrying the message
+// "cl: unsupported". Callers compare against it directly (err == ErrUnsupported).
+var ErrUnsupported = errors.New("cl: unsupported")
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/cl_test.go
@ -0,0 +1,254 @@
+package cl
+
+import (
+	"math/rand"
+	"reflect"
+	"strings"
+	"testing"
+)
+
+// kernelSource is the OpenCL C program used by TestHello. Its "square" kernel
+// writes input[i]*input[i] to output[i] for every work-item id i below count.
+var kernelSource = `
+__kernel void square(
+   __global float* input,
+   __global float* output,
+   const unsigned int count)
+{
+   int i = get_global_id(0);
+   if(i < count)
+       output[i] = input[i] * input[i];
+}
+`
+
+// getObjectStrings calls every method of object that takes no arguments
+// (besides the receiver) and returns exactly one value of string kind, and
+// returns the results keyed by method name.
+//
+// The result is read with reflect.Value.String rather than a type assertion
+// to string, so methods returning a named string type are handled instead of
+// causing a panic.
+func getObjectStrings(object interface{}) map[string]string {
+	val := reflect.ValueOf(object)
+	typ := reflect.TypeOf(object)
+
+	result := make(map[string]string)
+
+	for i := 0; i < typ.NumMethod(); i++ {
+		method := typ.Method(i)
+		sig := method.Type
+		// sig includes the receiver as its first input, so NumIn() == 1
+		// means the method itself takes no parameters.
+		if sig.NumIn() != 1 || sig.NumOut() != 1 || sig.Out(0).Kind() != reflect.String {
+			continue
+		}
+		outs := val.Method(i).Call(nil)
+		result[method.Name] = outs[0].String()
+	}
+
+	return result
+}
+
+// TestPlatformStringsContainNoNULs fails if any string-returning method of a
+// platform yields a value containing a NUL byte.
+func TestPlatformStringsContainNoNULs(t *testing.T) {
+	platforms, err := GetPlatforms()
+	if err != nil {
+		t.Fatalf("Failed to get platforms: %+v", err)
+	}
+
+	for _, platform := range platforms {
+		strs := getObjectStrings(platform)
+		for name, s := range strs {
+			if !strings.Contains(s, "\x00") {
+				continue
+			}
+			t.Fatalf("platform string %q =  %+q contains NUL", name, s)
+		}
+	}
+}
+
+// TestDeviceStringsContainNoNULs fails if any string-returning method of any
+// device on any platform yields a value containing a NUL byte.
+func TestDeviceStringsContainNoNULs(t *testing.T) {
+	platforms, err := GetPlatforms()
+	if err != nil {
+		t.Fatalf("Failed to get platforms: %+v", err)
+	}
+
+	for _, platform := range platforms {
+		devices, err := platform.GetDevices(DeviceTypeAll)
+		if err != nil {
+			t.Fatalf("Failed to get devices for platform %q: %+v", platform.Name(), err)
+		}
+
+		for _, device := range devices {
+			strs := getObjectStrings(device)
+			for name, s := range strs {
+				if !strings.Contains(s, "\x00") {
+					continue
+				}
+				t.Fatalf("device string %q =  %+q contains NUL", name, s)
+			}
+		}
+	}
+}
+
+// TestHello is an end-to-end smoke test: it logs the properties of every
+// platform and of every device on the first platform, then builds the
+// "square" kernel, runs it over 1024 random floats on the first GPU device
+// (or device 0 if there is no GPU) and checks that every result equals the
+// square of its input.
+func TestHello(t *testing.T) {
+	var data [1024]float32
+	for i := 0; i < len(data); i++ {
+		data[i] = rand.Float32()
+	}
+
+	platforms, err := GetPlatforms()
+	if err != nil {
+		t.Fatalf("Failed to get platforms: %+v", err)
+	}
+	// Guard the platforms[0] access below; an empty list would otherwise panic.
+	if len(platforms) == 0 {
+		t.Fatalf("GetPlatforms returned no platforms")
+	}
+	for i, p := range platforms {
+		t.Logf("Platform %d:", i)
+		t.Logf("  Name: %s", p.Name())
+		t.Logf("  Vendor: %s", p.Vendor())
+		t.Logf("  Profile: %s", p.Profile())
+		t.Logf("  Version: %s", p.Version())
+		t.Logf("  Extensions: %s", p.Extensions())
+	}
+	platform := platforms[0]
+
+	devices, err := platform.GetDevices(DeviceTypeAll)
+	if err != nil {
+		t.Fatalf("Failed to get devices: %+v", err)
+	}
+	if len(devices) == 0 {
+		t.Fatalf("GetDevices returned no devices")
+	}
+	deviceIndex := -1
+	for i, d := range devices {
+		if deviceIndex < 0 && d.Type() == DeviceTypeGPU {
+			deviceIndex = i
+		}
+		t.Logf("Device %d (%s): %s", i, d.Type(), d.Name())
+		t.Logf("  Address Bits: %d", d.AddressBits())
+		t.Logf("  Available: %+v", d.Available())
+		// t.Logf("  Built-In Kernels: %s", d.BuiltInKernels())
+		t.Logf("  Compiler Available: %+v", d.CompilerAvailable())
+		t.Logf("  Double FP Config: %s", d.DoubleFPConfig())
+		t.Logf("  Driver Version: %s", d.DriverVersion())
+		t.Logf("  Error Correction Supported: %+v", d.ErrorCorrectionSupport())
+		t.Logf("  Execution Capabilities: %s", d.ExecutionCapabilities())
+		t.Logf("  Extensions: %s", d.Extensions())
+		t.Logf("  Global Memory Cache Type: %s", d.GlobalMemCacheType())
+		// The cacheline size is reported in bytes and is far below 1 KB, so
+		// print it undivided instead of always logging 0 KB.
+		t.Logf("  Global Memory Cacheline Size: %d bytes", d.GlobalMemCachelineSize())
+		t.Logf("  Global Memory Size: %d MB", d.GlobalMemSize()/(1024*1024))
+		t.Logf("  Half FP Config: %s", d.HalfFPConfig())
+		t.Logf("  Host Unified Memory: %+v", d.HostUnifiedMemory())
+		t.Logf("  Image Support: %+v", d.ImageSupport())
+		t.Logf("  Image2D Max Dimensions: %d x %d", d.Image2DMaxWidth(), d.Image2DMaxHeight())
+		t.Logf("  Image3D Max Dimensions: %d x %d x %d", d.Image3DMaxWidth(), d.Image3DMaxHeight(), d.Image3DMaxDepth())
+		// t.Logf("  Image Max Buffer Size: %d", d.ImageMaxBufferSize())
+		// t.Logf("  Image Max Array Size: %d", d.ImageMaxArraySize())
+		// t.Logf("  Linker Available: %+v", d.LinkerAvailable())
+		t.Logf("  Little Endian: %+v", d.EndianLittle())
+		t.Logf("  Local Mem Size: %d KB", d.LocalMemSize()/1024)
+		t.Logf("  Local Mem Type: %s", d.LocalMemType())
+		t.Logf("  Max Clock Frequency: %d", d.MaxClockFrequency())
+		t.Logf("  Max Compute Units: %d", d.MaxComputeUnits())
+		t.Logf("  Max Constant Args: %d", d.MaxConstantArgs())
+		t.Logf("  Max Constant Buffer Size: %d KB", d.MaxConstantBufferSize()/1024)
+		t.Logf("  Max Mem Alloc Size: %d KB", d.MaxMemAllocSize()/1024)
+		t.Logf("  Max Parameter Size: %d", d.MaxParameterSize())
+		t.Logf("  Max Read-Image Args: %d", d.MaxReadImageArgs())
+		t.Logf("  Max Samplers: %d", d.MaxSamplers())
+		t.Logf("  Max Work Group Size: %d", d.MaxWorkGroupSize())
+		t.Logf("  Max Work Item Dimensions: %d", d.MaxWorkItemDimensions())
+		t.Logf("  Max Work Item Sizes: %d", d.MaxWorkItemSizes())
+		t.Logf("  Max Write-Image Args: %d", d.MaxWriteImageArgs())
+		t.Logf("  Memory Base Address Alignment: %d", d.MemBaseAddrAlign())
+		t.Logf("  Native Vector Width Char: %d", d.NativeVectorWidthChar())
+		t.Logf("  Native Vector Width Short: %d", d.NativeVectorWidthShort())
+		t.Logf("  Native Vector Width Int: %d", d.NativeVectorWidthInt())
+		t.Logf("  Native Vector Width Long: %d", d.NativeVectorWidthLong())
+		t.Logf("  Native Vector Width Float: %d", d.NativeVectorWidthFloat())
+		t.Logf("  Native Vector Width Double: %d", d.NativeVectorWidthDouble())
+		t.Logf("  Native Vector Width Half: %d", d.NativeVectorWidthHalf())
+		t.Logf("  OpenCL C Version: %s", d.OpenCLCVersion())
+		// t.Logf("  Parent Device: %+v", d.ParentDevice())
+		t.Logf("  Profile: %s", d.Profile())
+		t.Logf("  Profiling Timer Resolution: %d", d.ProfilingTimerResolution())
+		t.Logf("  Vendor: %s", d.Vendor())
+		t.Logf("  Version: %s", d.Version())
+	}
+	if deviceIndex < 0 {
+		deviceIndex = 0
+	}
+	device := devices[deviceIndex]
+	t.Logf("Using device %d", deviceIndex)
+	context, err := CreateContext([]*Device{device})
+	if err != nil {
+		t.Fatalf("CreateContext failed: %+v", err)
+	}
+	// imageFormats, err := context.GetSupportedImageFormats(0, MemObjectTypeImage2D)
+	// if err != nil {
+	// 	t.Fatalf("GetSupportedImageFormats failed: %+v", err)
+	// }
+	// t.Logf("Supported image formats: %+v", imageFormats)
+	queue, err := context.CreateCommandQueue(device, 0)
+	if err != nil {
+		t.Fatalf("CreateCommandQueue failed: %+v", err)
+	}
+	program, err := context.CreateProgramWithSource([]string{kernelSource})
+	if err != nil {
+		t.Fatalf("CreateProgramWithSource failed: %+v", err)
+	}
+	if err := program.BuildProgram(nil, ""); err != nil {
+		t.Fatalf("BuildProgram failed: %+v", err)
+	}
+	kernel, err := program.CreateKernel("square")
+	if err != nil {
+		t.Fatalf("CreateKernel failed: %+v", err)
+	}
+	for i := 0; i < 3; i++ {
+		name, err := kernel.ArgName(i)
+		if err == ErrUnsupported {
+			break
+		}
+		if err != nil {
+			t.Errorf("GetKernelArgInfo for name failed: %+v", err)
+			break
+		}
+		t.Logf("Kernel arg %d: %s", i, name)
+	}
+	input, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data))
+	if err != nil {
+		t.Fatalf("CreateBuffer failed for input: %+v", err)
+	}
+	// NOTE(review): the kernel writes to this buffer, yet it is created with
+	// MemReadOnly; a write-only or read-write flag looks more appropriate —
+	// confirm against the package's MemFlag constants.
+	output, err := context.CreateEmptyBuffer(MemReadOnly, 4*len(data))
+	if err != nil {
+		t.Fatalf("CreateBuffer failed for output: %+v", err)
+	}
+	if _, err := queue.EnqueueWriteBufferFloat32(input, true, 0, data[:], nil); err != nil {
+		t.Fatalf("EnqueueWriteBufferFloat32 failed: %+v", err)
+	}
+	if err := kernel.SetArgs(input, output, uint32(len(data))); err != nil {
+		t.Fatalf("SetKernelArgs failed: %+v", err)
+	}
+
+	local, err := kernel.WorkGroupSize(device)
+	if err != nil {
+		t.Fatalf("WorkGroupSize failed: %+v", err)
+	}
+	t.Logf("Work group size: %d", local)
+	// local is used as a divisor below; reject non-positive values up front.
+	if local <= 0 {
+		t.Fatalf("WorkGroupSize returned invalid size %d", local)
+	}
+	// The preferred multiple is informational only, so a failure is logged
+	// rather than treated as fatal.
+	if size, err := kernel.PreferredWorkGroupSizeMultiple(nil); err != nil {
+		t.Logf("PreferredWorkGroupSizeMultiple failed: %+v", err)
+	} else {
+		t.Logf("Preferred Work Group Size Multiple: %d", size)
+	}
+
+	// Round the global size up to the next multiple of the work-group size.
+	global := len(data)
+	d := len(data) % local
+	if d != 0 {
+		global += local - d
+	}
+	if _, err := queue.EnqueueNDRangeKernel(kernel, nil, []int{global}, []int{local}, nil); err != nil {
+		t.Fatalf("EnqueueNDRangeKernel failed: %+v", err)
+	}
+
+	if err := queue.Finish(); err != nil {
+		t.Fatalf("Finish failed: %+v", err)
+	}
+
+	results := make([]float32, len(data))
+	if _, err := queue.EnqueueReadBufferFloat32(output, true, 0, results, nil); err != nil {
+		t.Fatalf("EnqueueReadBufferFloat32 failed: %+v", err)
+	}
+
+	correct := 0
+	for i, v := range data {
+		if results[i] == v*v {
+			correct++
+		}
+	}
+
+	if correct != len(data) {
+		t.Fatalf("%d/%d correct values", correct, len(data))
+	}
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/context.go
@ -0,0 +1,161 @@
+package cl
+
+// #include <stdlib.h>
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+// maxImageFormats is the capacity of the fixed-size array that
+// GetSupportedImageFormats passes to clGetSupportedImageFormats.
+const maxImageFormats = 256
+
+// Context wraps an OpenCL cl_context handle together with the devices it was
+// created for.
+type Context struct {
+	clContext C.cl_context // underlying handle; set to nil once released
+	devices   []*Device    // devices passed to CreateContext
+}
+
+// MemObject wraps an OpenCL cl_mem handle and records the size it was
+// created with.
+type MemObject struct {
+	clMem C.cl_mem // underlying handle; set to nil once released
+	size  int      // size value supplied when the object was created
+}
+
+// releaseContext releases the underlying cl_context, if any, and clears the
+// handle so that a second call is a no-op. It is also installed as the
+// finalizer for Context values.
+func releaseContext(c *Context) {
+	if c.clContext == nil {
+		return
+	}
+	C.clReleaseContext(c.clContext)
+	c.clContext = nil
+}
+
+// releaseMemObject releases the underlying cl_mem, if any, and clears the
+// handle so that a second call is a no-op. It is also installed as the
+// finalizer for MemObject values.
+func releaseMemObject(b *MemObject) {
+	if b.clMem == nil {
+		return
+	}
+	C.clReleaseMemObject(b.clMem)
+	b.clMem = nil
+}
+
+// newMemObject wraps mo in a MemObject of the given size and registers
+// releaseMemObject as its finalizer.
+func newMemObject(mo C.cl_mem, size int) *MemObject {
+	obj := new(MemObject)
+	obj.clMem = mo
+	obj.size = size
+	runtime.SetFinalizer(obj, releaseMemObject)
+	return obj
+}
+
+// Release frees the underlying cl_mem handle. Calling it again after the
+// handle has been cleared does nothing.
+func (b *MemObject) Release() {
+	releaseMemObject(b)
+}
+
+// CreateContext creates an OpenCL context for the given devices and registers
+// a finalizer that releases it.
+//
+// An empty device list is reported as the CL_INVALID_VALUE error (the status
+// clCreateContext itself uses for num_devices == 0) instead of panicking on
+// the &deviceIds[0] access.
+//
+// TODO: properties
+func CreateContext(devices []*Device) (*Context, error) {
+	if len(devices) == 0 {
+		return nil, toError(C.CL_INVALID_VALUE)
+	}
+	deviceIds := buildDeviceIdList(devices)
+	var err C.cl_int
+	clContext := C.clCreateContext(nil, C.cl_uint(len(devices)), &deviceIds[0], nil, nil, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clContext == nil {
+		return nil, ErrUnknown
+	}
+	context := &Context{clContext: clContext, devices: devices}
+	runtime.SetFinalizer(context, releaseContext)
+	return context, nil
+}
+
+// GetSupportedImageFormats returns the image formats the context supports for
+// the given memory flags and image type. At most maxImageFormats entries are
+// returned.
+func (ctx *Context) GetSupportedImageFormats(flags MemFlag, imageType MemObjectType) ([]ImageFormat, error) {
+	var formats [maxImageFormats]C.cl_image_format
+	var nFormats C.cl_uint
+	if err := C.clGetSupportedImageFormats(ctx.clContext, C.cl_mem_flags(flags), C.cl_mem_object_type(imageType), maxImageFormats, &formats[0], &nFormats); err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	// nFormats is the number of formats the implementation supports, which
+	// may exceed the number of entries actually written; clamp it so the
+	// slice expression below cannot go out of range.
+	if nFormats > maxImageFormats {
+		nFormats = maxImageFormats
+	}
+	fmts := make([]ImageFormat, nFormats)
+	for i, f := range formats[:nFormats] {
+		fmts[i] = ImageFormat{
+			ChannelOrder:    ChannelOrder(f.image_channel_order),
+			ChannelDataType: ChannelDataType(f.image_channel_data_type),
+		}
+	}
+	return fmts, nil
+}
+
+// CreateCommandQueue creates a command queue on device within this context
+// using the given properties, and registers a finalizer that releases it.
+// A failing status is converted with toError; a nil queue handle with a
+// success status is reported as ErrUnknown.
+func (ctx *Context) CreateCommandQueue(device *Device, properties CommandQueueProperty) (*CommandQueue, error) {
+	var err C.cl_int
+	clQueue := C.clCreateCommandQueue(ctx.clContext, device.id, C.cl_command_queue_properties(properties), &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clQueue == nil {
+		return nil, ErrUnknown
+	}
+	commandQueue := &CommandQueue{clQueue: clQueue, device: device}
+	runtime.SetFinalizer(commandQueue, releaseCommandQueue)
+	return commandQueue, nil
+}
+
+// CreateProgramWithSource creates a program object from the given source
+// strings and registers a finalizer that releases it. The program inherits
+// the context's device list.
+//
+// An empty source list is reported as the CL_INVALID_VALUE error (the status
+// clCreateProgramWithSource uses for count == 0) instead of panicking on the
+// &cSources[0] access.
+func (ctx *Context) CreateProgramWithSource(sources []string) (*Program, error) {
+	if len(sources) == 0 {
+		return nil, toError(C.CL_INVALID_VALUE)
+	}
+	cSources := make([]*C.char, len(sources))
+	for i, s := range sources {
+		cs := C.CString(s)
+		cSources[i] = cs
+		// The C copies must stay alive until clCreateProgramWithSource has
+		// returned, so freeing at function exit is intentional.
+		defer C.free(unsafe.Pointer(cs))
+	}
+	var err C.cl_int
+	clProgram := C.clCreateProgramWithSource(ctx.clContext, C.cl_uint(len(sources)), &cSources[0], nil, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clProgram == nil {
+		return nil, ErrUnknown
+	}
+	program := &Program{clProgram: clProgram, devices: ctx.devices}
+	runtime.SetFinalizer(program, releaseProgram)
+	return program, nil
+}
+
+// CreateBufferUnsafe creates a buffer of size bytes via clCreateBuffer,
+// passing dataPtr through as the host pointer argument. A failing status is
+// converted with toError; a nil handle with a success status is reported as
+// ErrUnknown.
+func (ctx *Context) CreateBufferUnsafe(flags MemFlag, size int, dataPtr unsafe.Pointer) (*MemObject, error) {
+	var err C.cl_int
+	clBuffer := C.clCreateBuffer(ctx.clContext, C.cl_mem_flags(flags), C.size_t(size), dataPtr, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clBuffer == nil {
+		return nil, ErrUnknown
+	}
+	return newMemObject(clBuffer, size), nil
+}
+
+// CreateEmptyBuffer creates a buffer of size bytes with no host pointer.
+func (ctx *Context) CreateEmptyBuffer(flags MemFlag, size int) (*MemObject, error) {
+	return ctx.CreateBufferUnsafe(flags, size, nil)
+}
+
+// CreateEmptyBufferFloat32 creates a buffer with no host pointer, sized for
+// size float32 elements (4*size bytes).
+func (ctx *Context) CreateEmptyBufferFloat32(flags MemFlag, size int) (*MemObject, error) {
+	return ctx.CreateBufferUnsafe(flags, 4*size, nil)
+}
+
+// CreateBuffer creates a buffer of len(data) bytes, passing data as the host
+// pointer. For empty data a nil host pointer is passed, so the OpenCL error
+// for a zero-sized buffer is returned instead of panicking on &data[0].
+func (ctx *Context) CreateBuffer(flags MemFlag, data []byte) (*MemObject, error) {
+	var dataPtr unsafe.Pointer
+	if len(data) > 0 {
+		dataPtr = unsafe.Pointer(&data[0])
+	}
+	return ctx.CreateBufferUnsafe(flags, len(data), dataPtr)
+}
+
+// CreateBufferFloat32 creates a buffer of 4*len(data) bytes, passing data as
+// the host pointer. For empty data a nil host pointer is passed, so the
+// OpenCL error for a zero-sized buffer is returned instead of panicking on
+// &data[0].
+func (ctx *Context) CreateBufferFloat32(flags MemFlag, data []float32) (*MemObject, error) {
+	var dataPtr unsafe.Pointer
+	if len(data) > 0 {
+		dataPtr = unsafe.Pointer(&data[0])
+	}
+	return ctx.CreateBufferUnsafe(flags, 4*len(data), dataPtr)
+}
+
+// CreateUserEvent creates a user event object in this context. A failing
+// status is converted with toError; a nil handle with a success status is
+// reported as ErrUnknown, matching the other constructors on Context.
+func (ctx *Context) CreateUserEvent() (*Event, error) {
+	var err C.cl_int
+	clEvent := C.clCreateUserEvent(ctx.clContext, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clEvent == nil {
+		return nil, ErrUnknown
+	}
+	return newEvent(clEvent), nil
+}
+
+// Release frees the underlying cl_context handle. Calling it again after the
+// handle has been cleared does nothing.
+func (ctx *Context) Release() {
+	releaseContext(ctx)
+}
+
+// http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateSubBuffer.html
+// func (memObject *MemObject) CreateSubBuffer(flags MemFlag, bufferCreateType BufferCreateType, )
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device.go
@ -0,0 +1,510 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #include "cl_ext.h"
+// #endif
+import "C"
+
+import (
+	"strings"
+	"unsafe"
+)
+
+// maxDeviceCount is the capacity of the fixed-size id array GetDevices passes
+// to clGetDeviceIDs; at most this many devices are returned.
+const maxDeviceCount = 64
+
+// DeviceType is a bit field of CL_DEVICE_TYPE_* values.
+type DeviceType uint
+
+// Device type flags, mirroring the CL_DEVICE_TYPE_* constants.
+const (
+	DeviceTypeCPU         DeviceType = C.CL_DEVICE_TYPE_CPU
+	DeviceTypeGPU         DeviceType = C.CL_DEVICE_TYPE_GPU
+	DeviceTypeAccelerator DeviceType = C.CL_DEVICE_TYPE_ACCELERATOR
+	DeviceTypeDefault     DeviceType = C.CL_DEVICE_TYPE_DEFAULT
+	DeviceTypeAll         DeviceType = C.CL_DEVICE_TYPE_ALL
+)
+
+// FPConfig is a bit field of CL_FP_* floating-point capability flags.
+type FPConfig int
+
+// Floating-point capability flags, mirroring the CL_FP_* constants.
+const (
+	FPConfigDenorm         FPConfig = C.CL_FP_DENORM           // denorms are supported
+	FPConfigInfNaN         FPConfig = C.CL_FP_INF_NAN          // INF and NaNs are supported
+	FPConfigRoundToNearest FPConfig = C.CL_FP_ROUND_TO_NEAREST // round to nearest even rounding mode supported
+	FPConfigRoundToZero    FPConfig = C.CL_FP_ROUND_TO_ZERO    // round to zero rounding mode supported
+	FPConfigRoundToInf     FPConfig = C.CL_FP_ROUND_TO_INF     // round to positive and negative infinity rounding modes supported
+	FPConfigFMA            FPConfig = C.CL_FP_FMA              // IEEE754-2008 fused multiply-add is supported
+	FPConfigSoftFloat      FPConfig = C.CL_FP_SOFT_FLOAT       // Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software
+)
+
+// fpConfigNameMap maps each FPConfig flag to the name FPConfig.String prints
+// for it.
+var fpConfigNameMap = map[FPConfig]string{
+	FPConfigDenorm:         "Denorm",
+	FPConfigInfNaN:         "InfNaN",
+	FPConfigRoundToNearest: "RoundToNearest",
+	FPConfigRoundToZero:    "RoundToZero",
+	FPConfigRoundToInf:     "RoundToInf",
+	FPConfigFMA:            "FMA",
+	FPConfigSoftFloat:      "SoftFloat",
+}
+
+// String returns the names of all flags set in c joined by "|", or "" if no
+// named flag is set.
+//
+// Flags are visited from the lowest bit upwards so the output is stable
+// between calls; ranging over fpConfigNameMap directly would yield a random
+// order. This relies on every key of fpConfigNameMap being a single-bit
+// value, as the CL_FP_* flags are.
+func (c FPConfig) String() string {
+	var parts []string
+	// The loop ends once the bit is shifted into the sign position (or out
+	// of the word), whichever width int has.
+	for bit := FPConfig(1); bit > 0; bit <<= 1 {
+		if c&bit == 0 {
+			continue
+		}
+		if name, ok := fpConfigNameMap[bit]; ok {
+			parts = append(parts, name)
+		}
+	}
+	return strings.Join(parts, "|")
+}
+
+// String lists the CPU, GPU, Accelerator and Default flags set in dt, in that
+// order, joined by "|". It returns "None" when none of them is set.
+func (dt DeviceType) String() string {
+	flags := [...]struct {
+		mask DeviceType
+		name string
+	}{
+		{DeviceTypeCPU, "CPU"},
+		{DeviceTypeGPU, "GPU"},
+		{DeviceTypeAccelerator, "Accelerator"},
+		{DeviceTypeDefault, "Default"},
+	}
+
+	var parts []string
+	for _, f := range flags {
+		if dt&f.mask != 0 {
+			parts = append(parts, f.name)
+		}
+	}
+	if len(parts) == 0 {
+		return "None"
+	}
+	return strings.Join(parts, "|")
+}
+
+// Device wraps an OpenCL cl_device_id.
+type Device struct {
+	id C.cl_device_id
+}
+
+// buildDeviceIdList returns the cl_device_id of every device, in order.
+func buildDeviceIdList(devices []*Device) []C.cl_device_id {
+	ids := make([]C.cl_device_id, 0, len(devices))
+	for _, dev := range devices {
+		ids = append(ids, dev.id)
+	}
+	return ids
+}
+
+// GetDevices lists the devices of the requested type on a platform.
+// 'platform' is a platform returned by GetPlatforms, or nil; with nil the
+// behavior is implementation-defined. No more than maxDeviceCount devices
+// are returned.
+func GetDevices(platform *Platform, deviceType DeviceType) ([]*Device, error) {
+	var (
+		ids   [maxDeviceCount]C.cl_device_id
+		count C.cl_uint
+		pid   C.cl_platform_id
+	)
+	if platform != nil {
+		pid = platform.id
+	}
+
+	status := C.clGetDeviceIDs(pid, C.cl_device_type(deviceType), C.cl_uint(maxDeviceCount), &ids[0], &count)
+	if status != C.CL_SUCCESS {
+		return nil, toError(status)
+	}
+
+	// Clamp to the number of ids that fit in the array.
+	if count > maxDeviceCount {
+		count = maxDeviceCount
+	}
+	found := make([]*Device, 0, int(count))
+	for _, id := range ids[:count] {
+		found = append(found, &Device{id: id})
+	}
+	return found, nil
+}
+
+// nullableId returns the device's cl_device_id, or nil for a nil receiver.
+func (d *Device) nullableId() C.cl_device_id {
+	if d != nil {
+		return d.id
+	}
+	return nil
+}
+
+// GetInfoString queries the string-valued device parameter param via
+// clGetDeviceInfo.
+//
+// The required size is queried first and a buffer of exactly that size is
+// allocated, so long values (extension lists commonly exceed 1 KB) no longer
+// fail against a fixed 1024-byte buffer.
+//
+// If a query fails, the function panics when panicOnError is true and
+// otherwise returns the converted OpenCL error.
+func (d *Device) GetInfoString(param C.cl_device_info, panicOnError bool) (string, error) {
+	var strN C.size_t
+	if err := C.clGetDeviceInfo(d.id, param, 0, nil, &strN); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return "", toError(err)
+	}
+	// A zero size would underflow the unsigned strN-1 below.
+	if strN == 0 {
+		return "", nil
+	}
+
+	strC := make([]C.char, int(strN))
+	if err := C.clGetDeviceInfo(d.id, param, strN, unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return "", toError(err)
+	}
+	if strN == 0 {
+		return "", nil
+	}
+
+	// OpenCL strings are NUL-terminated, and the terminator is included in strN
+	// Go strings aren't NUL-terminated, so subtract 1 from the length
+	return C.GoStringN(&strC[0], C.int(strN-1)), nil
+}
+
+// getInfoUint queries a cl_uint device parameter. On failure it panics when
+// panicOnError is true and otherwise returns the converted OpenCL error.
+func (d *Device) getInfoUint(param C.cl_device_info, panicOnError bool) (uint, error) {
+	var val C.cl_uint
+	if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return 0, toError(err)
+	}
+	return uint(val), nil
+}
+
+// getInfoSize queries a size_t device parameter and returns it as an int. On
+// failure it panics when panicOnError is true and otherwise returns the
+// converted OpenCL error.
+func (d *Device) getInfoSize(param C.cl_device_info, panicOnError bool) (int, error) {
+	var val C.size_t
+	if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return 0, toError(err)
+	}
+	return int(val), nil
+}
+
+// getInfoUlong queries a cl_ulong device parameter and returns it as an
+// int64. On failure it panics when panicOnError is true and otherwise returns
+// the converted OpenCL error.
+func (d *Device) getInfoUlong(param C.cl_device_info, panicOnError bool) (int64, error) {
+	var val C.cl_ulong
+	if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return 0, toError(err)
+	}
+	return int64(val), nil
+}
+
+// getInfoBool queries a cl_bool device parameter and reports whether it
+// equals CL_TRUE. On failure it panics when panicOnError is true and
+// otherwise returns the converted OpenCL error.
+func (d *Device) getInfoBool(param C.cl_device_info, panicOnError bool) (bool, error) {
+	var val C.cl_bool
+	if err := C.clGetDeviceInfo(d.id, param, C.size_t(unsafe.Sizeof(val)), unsafe.Pointer(&val), nil); err != C.CL_SUCCESS {
+		if panicOnError {
+			panic("Should never fail")
+		}
+		return false, toError(err)
+	}
+	return val == C.CL_TRUE, nil
+}
+
+// Name returns the CL_DEVICE_NAME string. It panics if the query fails.
+func (d *Device) Name() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_NAME, true)
+	return str
+}
+
+// Vendor returns the CL_DEVICE_VENDOR string. It panics if the query fails.
+func (d *Device) Vendor() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_VENDOR, true)
+	return str
+}
+
+// Extensions returns the CL_DEVICE_EXTENSIONS string. It panics if the query
+// fails.
+func (d *Device) Extensions() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_EXTENSIONS, true)
+	return str
+}
+
+// OpenCLCVersion returns the CL_DEVICE_OPENCL_C_VERSION string. It panics if
+// the query fails.
+func (d *Device) OpenCLCVersion() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_OPENCL_C_VERSION, true)
+	return str
+}
+
+// Profile returns the CL_DEVICE_PROFILE string. It panics if the query fails.
+func (d *Device) Profile() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_PROFILE, true)
+	return str
+}
+
+// Version returns the CL_DEVICE_VERSION string. It panics if the query fails.
+func (d *Device) Version() string {
+	str, _ := d.GetInfoString(C.CL_DEVICE_VERSION, true)
+	return str
+}
+
+// DriverVersion returns the CL_DRIVER_VERSION string. It panics if the query
+// fails.
+func (d *Device) DriverVersion() string {
+	str, _ := d.GetInfoString(C.CL_DRIVER_VERSION, true)
+	return str
+}
+
+// The default compute device address space size specified as an
+// unsigned integer value in bits. Currently supported values are 32 or 64 bits.
+// Panics if the query fails.
+func (d *Device) AddressBits() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_ADDRESS_BITS, true)
+	return int(val)
+}
+
+// Size of global memory cache line in bytes.
+// Panics if the query fails.
+func (d *Device) GlobalMemCachelineSize() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, true)
+	return int(val)
+}
+
+// Maximum configured clock frequency of the device in MHz.
+// Panics if the query fails.
+func (d *Device) MaxClockFrequency() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CLOCK_FREQUENCY, true)
+	return int(val)
+}
+
+// The number of parallel compute units on the OpenCL device.
+// A work-group executes on a single compute unit. The minimum value is 1.
+// Panics if the query fails.
+func (d *Device) MaxComputeUnits() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_COMPUTE_UNITS, true)
+	return int(val)
+}
+
+// Max number of arguments declared with the __constant qualifier in a kernel.
+// The minimum value is 8 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) MaxConstantArgs() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_CONSTANT_ARGS, true)
+	return int(val)
+}
+
+// Max number of simultaneous image objects that can be read by a kernel.
+// The minimum value is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) MaxReadImageArgs() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_READ_IMAGE_ARGS, true)
+	return int(val)
+}
+
+// Maximum number of samplers that can be used in a kernel. The minimum
+// value is 16 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. (Also see sampler_t.)
+// Panics if the query fails.
+func (d *Device) MaxSamplers() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_SAMPLERS, true)
+	return int(val)
+}
+
+// Maximum dimensions that specify the global and local work-item IDs used
+// by the data parallel execution model. (Refer to clEnqueueNDRangeKernel).
+// The minimum value is 3 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) MaxWorkItemDimensions() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, true)
+	return int(val)
+}
+
+// Max number of simultaneous image objects that can be written to by a
+// kernel. The minimum value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) MaxWriteImageArgs() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MAX_WRITE_IMAGE_ARGS, true)
+	return int(val)
+}
+
+// The minimum value is the size (in bits) of the largest OpenCL built-in
+// data type supported by the device (long16 in FULL profile, long16 or
+// int16 in EMBEDDED profile) for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) MemBaseAddrAlign() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_MEM_BASE_ADDR_ALIGN, true)
+	return int(val)
+}
+
+// NativeVectorWidthChar returns CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthChar() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, true)
+	return int(val)
+}
+
+// NativeVectorWidthShort returns CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthShort() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, true)
+	return int(val)
+}
+
+// NativeVectorWidthInt returns CL_DEVICE_NATIVE_VECTOR_WIDTH_INT.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthInt() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, true)
+	return int(val)
+}
+
+// NativeVectorWidthLong returns CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthLong() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, true)
+	return int(val)
+}
+
+// NativeVectorWidthFloat returns CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthFloat() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, true)
+	return int(val)
+}
+
+// NativeVectorWidthDouble returns CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthDouble() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, true)
+	return int(val)
+}
+
+// NativeVectorWidthHalf returns CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF.
+// Panics if the query fails.
+func (d *Device) NativeVectorWidthHalf() int {
+	val, _ := d.getInfoUint(C.CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, true)
+	return int(val)
+}
+
+// Max height of 2D image in pixels. The minimum value is 8192
+// if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Image2DMaxHeight() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_HEIGHT, true)
+	return int(val)
+}
+
+// Max width of 2D image or 1D image not created from a buffer object in
+// pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Image2DMaxWidth() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE2D_MAX_WIDTH, true)
+	return int(val)
+}
+
+// Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Image3DMaxDepth() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_DEPTH, true)
+	return int(val)
+}
+
+// Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Image3DMaxHeight() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_HEIGHT, true)
+	return int(val)
+}
+
+// Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Image3DMaxWidth() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_IMAGE3D_MAX_WIDTH, true)
+	return int(val)
+}
+
+// Max size in bytes of the arguments that can be passed to a kernel. The
+// minimum value is 1024 for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// For this minimum value, only a maximum of 128 arguments can be passed to a kernel.
+// Panics if the query fails.
+func (d *Device) MaxParameterSize() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_MAX_PARAMETER_SIZE, true)
+	return int(val)
+}
+
+// Maximum number of work-items in a work-group executing a kernel on a
+// single compute unit, using the data parallel execution model. (Refer
+// to clEnqueueNDRangeKernel). The minimum value is 1.
+// Panics if the query fails.
+func (d *Device) MaxWorkGroupSize() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_MAX_WORK_GROUP_SIZE, true)
+	return int(val)
+}
+
+// Describes the resolution of device timer. This is measured in nanoseconds.
+// Panics if the query fails.
+func (d *Device) ProfilingTimerResolution() int {
+	val, _ := d.getInfoSize(C.CL_DEVICE_PROFILING_TIMER_RESOLUTION, true)
+	return int(val)
+}
+
+// Size of local memory arena in bytes. The minimum value is 32 KB for
+// devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) LocalMemSize() int64 {
+	val, _ := d.getInfoUlong(C.CL_DEVICE_LOCAL_MEM_SIZE, true)
+	return val
+}
+
+// Max size in bytes of a constant buffer allocation. The minimum value is
+// 64 KB for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) MaxConstantBufferSize() int64 {
+	val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, true)
+	return val
+}
+
+// Max size of memory object allocation in bytes. The minimum value is max
+// (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) for devices that are
+// not of type CL_DEVICE_TYPE_CUSTOM.
+// Panics if the query fails.
+func (d *Device) MaxMemAllocSize() int64 {
+	val, _ := d.getInfoUlong(C.CL_DEVICE_MAX_MEM_ALLOC_SIZE, true)
+	return val
+}
+
+// Size of global device memory in bytes.
+// Panics if the query fails.
+func (d *Device) GlobalMemSize() int64 {
+	val, _ := d.getInfoUlong(C.CL_DEVICE_GLOBAL_MEM_SIZE, true)
+	return val
+}
+
+// Available reports whether CL_DEVICE_AVAILABLE is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) Available() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_AVAILABLE, true)
+	return val
+}
+
+// CompilerAvailable reports whether CL_DEVICE_COMPILER_AVAILABLE is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) CompilerAvailable() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_COMPILER_AVAILABLE, true)
+	return val
+}
+
+// EndianLittle reports whether CL_DEVICE_ENDIAN_LITTLE is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) EndianLittle() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_ENDIAN_LITTLE, true)
+	return val
+}
+
+// Is CL_TRUE if the device implements error correction for all
+// accesses to compute device memory (global and constant). Is
+// CL_FALSE if the device does not implement such error correction.
+// Panics if the query fails.
+func (d *Device) ErrorCorrectionSupport() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_ERROR_CORRECTION_SUPPORT, true)
+	return val
+}
+
+// HostUnifiedMemory reports whether CL_DEVICE_HOST_UNIFIED_MEMORY is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) HostUnifiedMemory() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_HOST_UNIFIED_MEMORY, true)
+	return val
+}
+
+// ImageSupport reports whether CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
+// Panics if the query fails.
+func (d *Device) ImageSupport() bool {
+	val, _ := d.getInfoBool(C.CL_DEVICE_IMAGE_SUPPORT, true)
+	return val
+}
+
+// Type returns the device's CL_DEVICE_TYPE value. It panics if the query
+// fails.
+func (d *Device) Type() DeviceType {
+	var deviceType C.cl_device_type
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_TYPE, C.size_t(unsafe.Sizeof(deviceType)), unsafe.Pointer(&deviceType), nil); err != C.CL_SUCCESS {
+		panic("Failed to get device type")
+	}
+	return DeviceType(deviceType)
+}
+
+// Describes double precision floating-point capability of the OpenCL device.
+//
+// Double precision is optional, so a failed query is reported as an empty
+// configuration (FPConfig(0)) rather than a panic, matching HalfFPConfig.
+func (d *Device) DoubleFPConfig() FPConfig {
+	var fpConfig C.cl_device_fp_config
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_DOUBLE_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil); err != C.CL_SUCCESS {
+		return FPConfig(0)
+	}
+	return FPConfig(fpConfig)
+}
+
+// Describes the OPTIONAL half precision floating-point capability of the OpenCL device.
+// Returns FPConfig(0) if the query fails.
+func (d *Device) HalfFPConfig() FPConfig {
+	var fpConfig C.cl_device_fp_config
+	err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_HALF_FP_CONFIG, C.size_t(unsafe.Sizeof(fpConfig)), unsafe.Pointer(&fpConfig), nil)
+	if err != C.CL_SUCCESS {
+		return FPConfig(0)
+	}
+	return FPConfig(fpConfig)
+}
+
+// Type of local memory supported. This can be set to CL_LOCAL implying dedicated
+// local memory storage such as SRAM, or CL_GLOBAL. For custom devices, CL_NONE
+// can also be returned indicating no local memory support.
+// CL_NONE is also returned if the query fails.
+func (d *Device) LocalMemType() LocalMemType {
+	var memType C.cl_device_local_mem_type
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_LOCAL_MEM_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS {
+		return LocalMemType(C.CL_NONE)
+	}
+	return LocalMemType(memType)
+}
+
+// Describes the execution capabilities of the device. The mandated minimum capability is CL_EXEC_KERNEL.
+// Panics if the query fails.
+func (d *Device) ExecutionCapabilities() ExecCapability {
+	var execCap C.cl_device_exec_capabilities
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_EXECUTION_CAPABILITIES, C.size_t(unsafe.Sizeof(execCap)), unsafe.Pointer(&execCap), nil); err != C.CL_SUCCESS {
+		panic("Failed to get execution capabilities")
+	}
+	return ExecCapability(execCap)
+}
+
+// GlobalMemCacheType returns the device's CL_DEVICE_GLOBAL_MEM_CACHE_TYPE
+// value, or CL_NONE if the query fails.
+func (d *Device) GlobalMemCacheType() MemCacheType {
+	var memType C.cl_device_mem_cache_type
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, C.size_t(unsafe.Sizeof(memType)), unsafe.Pointer(&memType), nil); err != C.CL_SUCCESS {
+		return MemCacheType(C.CL_NONE)
+	}
+	return MemCacheType(memType)
+}
+
+// Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel.
+//
+// Returns n size_t entries, where n is the value returned by the query for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+//
+// The minimum value is (1, 1, 1) for devices that are not of type CL_DEVICE_TYPE_CUSTOM.
+//
+// Returns nil when the device reports zero dimensions. Panics if the
+// CL_DEVICE_MAX_WORK_ITEM_SIZES query fails.
+func (d *Device) MaxWorkItemSizes() []int {
+	dims := d.MaxWorkItemDimensions()
+	// With no dimensions there is nothing to query, and &sizes[0] below
+	// would index an empty slice.
+	if dims <= 0 {
+		return nil
+	}
+	sizes := make([]C.size_t, dims)
+	if err := C.clGetDeviceInfo(d.id, C.CL_DEVICE_MAX_WORK_ITEM_SIZES, C.size_t(int(unsafe.Sizeof(sizes[0]))*dims), unsafe.Pointer(&sizes[0]), nil); err != C.CL_SUCCESS {
+		panic("Failed to get max work item sizes")
+	}
+	intSizes := make([]int, dims)
+	for i, s := range sizes {
+		intSizes[i] = int(s)
+	}
+	return intSizes
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/device12.go
@ -0,0 +1,51 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import "unsafe"
+
+// FPConfigCorrectlyRoundedDivideSqrt mirrors the OpenCL constant
+// CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT. It is only declared when building with
+// the cl12 build tag.
+const FPConfigCorrectlyRoundedDivideSqrt FPConfig = C.CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT
+
+// init registers the display name for FPConfigCorrectlyRoundedDivideSqrt in
+// fpConfigNameMap (declared elsewhere in the package; presumably used when
+// formatting FPConfig values as strings -- confirm against its definition).
+func init() {
+	fpConfigNameMap[FPConfigCorrectlyRoundedDivideSqrt] = "CorrectlyRoundedDivideSqrt"
+}
+
+// BuiltInKernels returns the device's CL_DEVICE_BUILT_IN_KERNELS info string.
+//
+// The error result of getInfoString is discarded, so callers only ever see
+// the string value it produced.
+func (d *Device) BuiltInKernels() string {
+	kernels, _ := d.getInfoString(C.CL_DEVICE_BUILT_IN_KERNELS, true)
+	return kernels
+}
+
+// LinkerAvailable reports the device's CL_DEVICE_LINKER_AVAILABLE value.
+//
+// Per the OpenCL specification this is CL_FALSE if the implementation does
+// not have a linker available and CL_TRUE if it does. It can be CL_FALSE for
+// the embedded platform profile only, and must be CL_TRUE if
+// CL_DEVICE_COMPILER_AVAILABLE is CL_TRUE.
+//
+// The error result of getInfoBool is discarded.
+func (d *Device) LinkerAvailable() bool {
+	available, _ := d.getInfoBool(C.CL_DEVICE_LINKER_AVAILABLE, true)
+	return available
+}
+
+// ParentDevice returns the device reported by the CL_DEVICE_PARENT_DEVICE
+// query, wrapped in a new Device value. It returns nil when OpenCL reports a
+// null parent id, and panics if the query itself fails.
+func (d *Device) ParentDevice() *Device {
+	var parent C.cl_device_id
+	status := C.clGetDeviceInfo(
+		d.id,
+		C.CL_DEVICE_PARENT_DEVICE,
+		C.size_t(unsafe.Sizeof(parent)),
+		unsafe.Pointer(&parent),
+		nil,
+	)
+	if status != C.CL_SUCCESS {
+		panic("ParentDevice failed")
+	}
+	if parent != nil {
+		return &Device{id: parent}
+	}
+	return nil
+}
+
+// ImageMaxBufferSize returns the device's CL_DEVICE_IMAGE_MAX_BUFFER_SIZE: the
+// max number of pixels for a 1D image created from a buffer object. Per the
+// OpenCL specification the minimum value is 65536 if CL_DEVICE_IMAGE_SUPPORT
+// is CL_TRUE.
+//
+// The error result of getInfoSize is discarded.
+func (d *Device) ImageMaxBufferSize() int {
+	pixels, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, true)
+	return int(pixels)
+}
+
+// ImageMaxArraySize returns the device's CL_DEVICE_IMAGE_MAX_ARRAY_SIZE: the
+// max number of images in a 1D or 2D image array. Per the OpenCL
+// specification the minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is
+// CL_TRUE.
+//
+// The error result of getInfoSize is discarded.
+func (d *Device) ImageMaxArraySize() int {
+	count, _ := d.getInfoSize(C.CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, true)
+	return int(count)
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl.h
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_ext.h
@ -0,0 +1,315 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+#include <cl.h>
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
+ * which they were registered. The user callback functions are called and then the memory object is deleted 
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger 
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
+                                            const void * /* private_info */, 
+                                            size_t       /* cb */, 
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************ 
+* cl_khr_icd extension *                                                  
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+    
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+    
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+    
+    
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+    
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
+#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+    
+    
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a 
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
+#ifdef CL_VERSION_1_1
+   /***********************************
+    * cl_ext_device_fission extension *
+    ***********************************/
+    #define cl_ext_device_fission   1
+    
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef cl_ulong  cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                            const cl_device_partition_property_ext * /* properties */,
+                            cl_uint /*num_entries*/,
+                            cl_device_id * /*out_devices*/,
+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int 
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                                const cl_device_partition_property_ext * /* properties */,
+                                                cl_uint /*num_entries*/,
+                                                cl_device_id * /*out_devices*/,
+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+    
+    /* clDeviceGetInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+    
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+    
+    /* CL_AFFINITY_DOMAINs */
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+    
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+/* Common header describing an externally allocated host memory region.
+ * It is embedded as the first member of cl_mem_ion_host_ptr below.
+ * NOTE(review): presumably passed as host_ptr together with the
+ * CL_MEM_EXT_HOST_PTR_QCOM flag -- confirm against the Qualcomm
+ * cl_qcom_ext_host_ptr extension specification. */
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+            
+	/* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+/* Describes an ION-backed external allocation: the generic
+ * cl_mem_ext_host_ptr header followed by the ION file descriptor and the
+ * host pointer of the allocation. */
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    /* (The allocation type itself is the allocation_type field of this header.) */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* ION file descriptor */
+    int                  ion_filedesc;
+            
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl.h
@ -0,0 +1,158 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#include <cl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context      /* context */,
+                      cl_mem_flags    /* flags */,
+                      cl_GLenum       /* target */,
+                      cl_GLint        /* miplevel */,
+                      cl_GLuint       /* texture */,
+                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+    
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   /* context */,
+                           cl_mem_flags /* flags */,
+                           cl_GLuint    /* renderbuffer */,
+                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem                /* memobj */,
+                  cl_gl_object_type *   /* gl_object_type */,
+                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+                  
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem               /* memobj */,
+                   cl_gl_texture_info   /* param_name */,
+                   size_t               /* param_value_size */,
+                   void *               /* param_value */,
+                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+/* cl_khr_gl_sharing extension  */
+    
+#define cl_khr_gl_sharing 1
+    
+typedef cl_uint     cl_gl_context_info;
+    
+/* Additional Error Codes  */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+    
+/* cl_gl_context_info  */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+    
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+                      cl_gl_context_info            /* param_name */,
+                      size_t                        /* param_value_size */,
+                      void *                        /* param_value */,
+                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+    
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_GL_H */
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_gl_ext.h
@ -0,0 +1,65 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <cl_gl.h>
+
+/*
+ * For each extension, follow this template
+ *  cl_VEN_extname extension  */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/* 
+ *  cl_khr_gl_event  extension
+ *  See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context           /* context */,
+                           cl_GLsync            /* cl_GLsync */,
+                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* __OPENCL_CL_GL_EXT_H  */
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/cl_platform.h
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/headers/1.2/opencl.h
@ -0,0 +1,43 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <cl.h>
+#include <cl_gl.h>
+#include <cl_gl_ext.h>
+#include <cl_ext.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_H   */
+
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/image.go
@ -0,0 +1,83 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import (
+	"image"
+	"unsafe"
+)
+
+// CreateImage creates an OpenCL image object in ctx via clCreateImage.
+//
+// imageFormat and imageDesc are converted to their C representations with
+// toCl. data supplies the optional host memory for the image: when it is nil
+// or empty a NULL host pointer is passed to OpenCL, otherwise a pointer to
+// its first byte is passed.
+//
+// It returns the error mapped from the OpenCL status code when the call does
+// not report CL_SUCCESS, ErrUnknown when OpenCL reports success but returns a
+// null handle, and otherwise a MemObject created with len(data) as its size.
+func (ctx *Context) CreateImage(flags MemFlag, imageFormat ImageFormat, imageDesc ImageDescription, data []byte) (*MemObject, error) {
+	format := imageFormat.toCl()
+	desc := imageDesc.toCl()
+	var dataPtr unsafe.Pointer
+	// Check the length rather than nil-ness: a non-nil but empty slice has no
+	// element 0, so &data[0] would panic with an index-out-of-range error.
+	if len(data) > 0 {
+		dataPtr = unsafe.Pointer(&data[0])
+	}
+	var err C.cl_int
+	clBuffer := C.clCreateImage(ctx.clContext, C.cl_mem_flags(flags), &format, &desc, dataPtr, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clBuffer == nil {
+		return nil, ErrUnknown
+	}
+	return newMemObject(clBuffer, len(data)), nil
+}
+
+// CreateImageSimple is a convenience wrapper around CreateImage for 2D
+// images. It builds the ImageFormat from channelOrder and channelDataType,
+// and an ImageDescription of type MemObjectTypeImage2D with the given width
+// and height; all other description fields keep their zero values.
+func (ctx *Context) CreateImageSimple(flags MemFlag, width, height int, channelOrder ChannelOrder, channelDataType ChannelDataType, data []byte) (*MemObject, error) {
+	return ctx.CreateImage(
+		flags,
+		ImageFormat{channelOrder, channelDataType},
+		ImageDescription{
+			Type:   MemObjectTypeImage2D,
+			Width:  width,
+			Height: height,
+		},
+		data,
+	)
+}
+
+func (ctx *Context) CreateImageFromImage(flags MemFlag, img image.Image) (*MemObject, error) {
+	switch m := img.(type) {
+	case *image.Gray:
+		format := ImageFormat{ChannelOrderIntensity, ChannelDataTypeUNormInt8}
+		desc := ImageDescription{
+			Type:     MemObjectTypeImage2D,
+			Width:    m.Bounds().Dx(),
+			Height:   m.Bounds().Dy(),
+			RowPitch: m.Stride,
+		}
+		return ctx.CreateImage(flags, format, desc, m.Pix)
+	case *image.RGBA:
+		format := ImageFormat{ChannelOrderRGBA, ChannelDataTypeUNormInt8}
+		desc := ImageDescription{
+			Type:     MemObjectTypeImage2D,
+			Width:    m.Bounds().Dx(),
+			Height:   m.Bounds().Dy(),
+			RowPitch: m.Stride,
+		}
+		return ctx.CreateImage(flags, format, desc, m.Pix)
+	}
+
+	b := img.Bounds()
+	w := b.Dx()
+	h := b.Dy()
+	data := make([]byte, w*h*4)
+	dataOffset := 0
+	for y := 0; y < h; y++ {
+		for x := 0; x < w; x++ {
+			c := img.At(x+b.Min.X, y+b.Min.Y)
+			r, g, b, a := c.RGBA()
+			data[dataOffset] = uint8(r >> 8)
+			data[dataOffset+1] = uint8(g >> 8)
+			data[dataOffset+2] = uint8(b >> 8)
+			data[dataOffset+3] = uint8(a >> 8)
+			dataOffset += 4
+		}
+	}
+	return ctx.CreateImageSimple(flags, w, h, ChannelOrderRGBA, ChannelDataTypeUNormInt8, data)
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel.go
@ -0,0 +1,127 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+	"fmt"
+	"unsafe"
+)
+
+// ErrUnsupportedArgumentType reports a kernel argument whose Go type has no
+// matching setter. Index is the argument position and Value the rejected
+// argument.
+type ErrUnsupportedArgumentType struct {
+	Index int
+	Value interface{}
+}
+
+// Error implements the error interface.
+func (e ErrUnsupportedArgumentType) Error() string {
+	const format = "cl: unsupported argument type for index %d: %+v"
+	return fmt.Sprintf(format, e.Index, e.Value)
+}
+
+// Kernel pairs an OpenCL kernel handle with the name it was created under.
+type Kernel struct {
+	clKernel C.cl_kernel
+	name     string
+}
+
+// LocalBuffer is an integer kernel-argument type; SetArg passes its value to
+// SetArgLocal as the argument size.
+type LocalBuffer int
+
+// releaseKernel releases the underlying OpenCL kernel, if one is held, and
+// clears the handle so that a repeated call does nothing.
+func releaseKernel(k *Kernel) {
+	if k.clKernel == nil {
+		return
+	}
+	C.clReleaseKernel(k.clKernel)
+	k.clKernel = nil
+}
+
+// Release frees the OpenCL kernel held by k. Calling it more than once is
+// harmless because the handle is cleared after the first release.
+func (k *Kernel) Release() {
+	releaseKernel(k)
+}
+
+// SetArgs sets the kernel arguments in order, using each value's position in
+// args as its argument index. It stops at, and returns, the first error
+// reported by SetArg.
+func (k *Kernel) SetArgs(args ...interface{}) error {
+	for i := 0; i < len(args); i++ {
+		err := k.SetArg(i, args[i])
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// SetArg sets the kernel argument at index, choosing the typed setter that
+// matches the dynamic type of arg. Supported types are uint8, int8, uint32,
+// int32, uint64, float32, *MemObject and LocalBuffer; any other type yields an
+// ErrUnsupportedArgumentType.
+func (k *Kernel) SetArg(index int, arg interface{}) error {
+	switch v := arg.(type) {
+	case *MemObject:
+		return k.SetArgBuffer(index, v)
+	case LocalBuffer:
+		return k.SetArgLocal(index, int(v))
+	case float32:
+		return k.SetArgFloat32(index, v)
+	case int8:
+		return k.SetArgInt8(index, v)
+	case uint8:
+		return k.SetArgUint8(index, v)
+	case int32:
+		return k.SetArgInt32(index, v)
+	case uint32:
+		return k.SetArgUint32(index, v)
+	case uint64:
+		return k.SetArgUint64(index, v)
+	}
+	return ErrUnsupportedArgumentType{Index: index, Value: arg}
+}
+
+func (k *Kernel) SetArgBuffer(index int, buffer *MemObject) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(buffer.clMem)), unsafe.Pointer(&buffer.clMem))
+}
+
+func (k *Kernel) SetArgFloat32(index int, val float32) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgInt8(index int, val int8) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint8(index int, val uint8) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgInt32(index int, val int32) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint32(index int, val uint32) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgUint64(index int, val uint64) error {
+	return k.SetArgUnsafe(index, int(unsafe.Sizeof(val)), unsafe.Pointer(&val))
+}
+
+func (k *Kernel) SetArgLocal(index int, size int) error {
+	return k.SetArgUnsafe(index, size, nil)
+}
+
+// SetArgUnsafe passes argSize and the raw pointer arg to clSetKernelArg for
+// the argument at index, and converts the returned status code with toError.
+func (k *Kernel) SetArgUnsafe(index, argSize int, arg unsafe.Pointer) error {
+	status := C.clSetKernelArg(k.clKernel, C.cl_uint(index), C.size_t(argSize), arg)
+	return toError(status)
+}
+
+// PreferredWorkGroupSizeMultiple queries
+// CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE for this kernel on device.
+func (k *Kernel) PreferredWorkGroupSizeMultiple(device *Device) (int, error) {
+	var multiple C.size_t
+	status := C.clGetKernelWorkGroupInfo(
+		k.clKernel,
+		device.nullableId(),
+		C.CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+		C.size_t(unsafe.Sizeof(multiple)),
+		unsafe.Pointer(&multiple),
+		nil,
+	)
+	return int(multiple), toError(status)
+}
+
+// WorkGroupSize queries CL_KERNEL_WORK_GROUP_SIZE for this kernel on device.
+func (k *Kernel) WorkGroupSize(device *Device) (int, error) {
+	var groupSize C.size_t
+	status := C.clGetKernelWorkGroupInfo(
+		k.clKernel,
+		device.nullableId(),
+		C.CL_KERNEL_WORK_GROUP_SIZE,
+		C.size_t(unsafe.Sizeof(groupSize)),
+		unsafe.Pointer(&groupSize),
+		nil,
+	)
+	return int(groupSize), toError(status)
+}
+
+// NumArgs queries CL_KERNEL_NUM_ARGS for this kernel.
+func (k *Kernel) NumArgs() (int, error) {
+	var count C.cl_uint
+	status := C.clGetKernelInfo(
+		k.clKernel,
+		C.CL_KERNEL_NUM_ARGS,
+		C.size_t(unsafe.Sizeof(count)),
+		unsafe.Pointer(&count),
+		nil,
+	)
+	return int(count), toError(status)
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel10.go
@ -0,0 +1,7 @@
+// +build !cl12
+
+package cl
+
+// ArgName is the variant compiled when the cl12 build tag is absent; it
+// performs no query and always returns ErrUnsupported.
+func (k *Kernel) ArgName(index int) (string, error) {
+	return "", ErrUnsupported
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/kernel12.go
@ -0,0 +1,20 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+import "unsafe"
+
+// ArgName returns the name of the kernel argument at index, as reported by
+// clGetKernelArgInfo with CL_KERNEL_ARG_NAME.
+//
+// The size OpenCL reports for a string query counts the terminating NUL byte,
+// so trailing NUL bytes are stripped from the result. The reported size is
+// also clamped to the local buffer before slicing.
+func (k *Kernel) ArgName(index int) (string, error) {
+	var strC [1024]byte
+	var strN C.size_t
+	if err := C.clGetKernelArgInfo(k.clKernel, C.cl_uint(index), C.CL_KERNEL_ARG_NAME, C.size_t(len(strC)), unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS {
+		return "", toError(err)
+	}
+	n := int(strN)
+	if n > len(strC) {
+		n = len(strC)
+	}
+	// Drop the NUL terminator so callers get a clean Go string.
+	for n > 0 && strC[n-1] == 0 {
+		n--
+	}
+	return string(strC[:n]), nil
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/platform.go
@ -0,0 +1,83 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import "unsafe"
+
+// maxPlatforms is the capacity of the fixed-size array GetPlatforms passes to
+// clGetPlatformIDs.
+const maxPlatforms = 32
+
+// Platform wraps an OpenCL platform id.
+type Platform struct {
+	id C.cl_platform_id
+}
+
+// GetPlatforms obtains the list of platforms available.
+//
+// At most maxPlatforms platforms are returned. clGetPlatformIDs reports the
+// number of platforms available, which may be larger than the number of
+// entries it was allowed to fill in, so the count is clamped before the id
+// array is indexed.
+func GetPlatforms() ([]*Platform, error) {
+	var platformIds [maxPlatforms]C.cl_platform_id
+	var nPlatforms C.cl_uint
+	if err := C.clGetPlatformIDs(C.cl_uint(maxPlatforms), &platformIds[0], &nPlatforms); err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	count := int(nPlatforms)
+	if count > maxPlatforms {
+		count = maxPlatforms
+	}
+	platforms := make([]*Platform, count)
+	for i := range platforms {
+		platforms[i] = &Platform{id: platformIds[i]}
+	}
+	return platforms, nil
+}
+
+// GetDevices is a convenience method that calls the package-level GetDevices
+// with this platform and the requested device type.
+func (p *Platform) GetDevices(deviceType DeviceType) ([]*Device, error) {
+	return GetDevices(p, deviceType)
+}
+
+// getInfoString queries a string-valued platform parameter through
+// clGetPlatformInfo and returns it without its final byte, which is expected
+// to be the terminating NUL.
+//
+// A reported size of zero yields an empty string (subtracting one from an
+// unsigned zero would otherwise wrap around and make the slice expression
+// panic), and the reported size is clamped to the local buffer.
+func (p *Platform) getInfoString(param C.cl_platform_info) (string, error) {
+	var strC [2048]byte
+	var strN C.size_t
+	if err := C.clGetPlatformInfo(p.id, param, C.size_t(len(strC)), unsafe.Pointer(&strC[0]), &strN); err != C.CL_SUCCESS {
+		return "", toError(err)
+	}
+	n := int(strN)
+	if n > len(strC) {
+		n = len(strC)
+	}
+	if n == 0 {
+		return "", nil
+	}
+	return string(strC[:n-1]), nil
+}
+
+// Name returns the CL_PLATFORM_NAME string. It panics if the query fails.
+func (p *Platform) Name() string {
+	str, err := p.getInfoString(C.CL_PLATFORM_NAME)
+	if err != nil {
+		panic("Platform.Name() should never fail")
+	}
+	return str
+}
+
+// Vendor returns the CL_PLATFORM_VENDOR string. It panics if the query fails.
+func (p *Platform) Vendor() string {
+	str, err := p.getInfoString(C.CL_PLATFORM_VENDOR)
+	if err != nil {
+		panic("Platform.Vendor() should never fail")
+	}
+	return str
+}
+
+// Profile returns the CL_PLATFORM_PROFILE string. It panics if the query
+// fails.
+func (p *Platform) Profile() string {
+	str, err := p.getInfoString(C.CL_PLATFORM_PROFILE)
+	if err != nil {
+		panic("Platform.Profile() should never fail")
+	}
+	return str
+}
+
+// Version returns the CL_PLATFORM_VERSION string. It panics if the query
+// fails.
+func (p *Platform) Version() string {
+	str, err := p.getInfoString(C.CL_PLATFORM_VERSION)
+	if err != nil {
+		panic("Platform.Version() should never fail")
+	}
+	return str
+}
+
+// Extensions returns the CL_PLATFORM_EXTENSIONS string. It panics if the
+// query fails.
+func (p *Platform) Extensions() string {
+	str, err := p.getInfoString(C.CL_PLATFORM_EXTENSIONS)
+	if err != nil {
+		panic("Platform.Extensions() should never fail")
+	}
+	return str
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/program.go
@ -0,0 +1,105 @@
+package cl
+
+// #include <stdlib.h>
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+	"fmt"
+	"runtime"
+	"unsafe"
+)
+
+// BuildError carries the message of a failed program build and, when known,
+// the device the message belongs to.
+type BuildError struct {
+	Message string
+	Device  *Device
+}
+
+// Error implements the error interface. The device name is included only
+// when Device is set.
+func (e BuildError) Error() string {
+	if e.Device == nil {
+		return fmt.Sprintf("cl: build error: %s", e.Message)
+	}
+	return fmt.Sprintf("cl: build error on %q: %s", e.Device.Name(), e.Message)
+}
+
+// Program wraps an OpenCL program handle together with a list of devices.
+// BuildProgram walks that device list when it collects build logs.
+type Program struct {
+	clProgram C.cl_program
+	devices   []*Device
+}
+
+// releaseProgram releases the underlying OpenCL program, if one is held, and
+// clears the handle so that a repeated call does nothing.
+func releaseProgram(p *Program) {
+	if p.clProgram == nil {
+		return
+	}
+	C.clReleaseProgram(p.clProgram)
+	p.clProgram = nil
+}
+
+// Release frees the OpenCL program held by p. Calling it more than once is
+// harmless because the handle is cleared after the first release.
+func (p *Program) Release() {
+	releaseProgram(p)
+}
+
+// BuildProgram builds the program with clBuildProgram for the given devices
+// (a nil or empty list passes a NULL device list) using the given build
+// options (an empty string passes NULL options).
+//
+// On build failure the build log is queried for each device in p.devices
+// (note: the Program's own list, not the devices argument). The first
+// non-empty log is returned as a BuildError. If the log query itself fails,
+// that failure is returned. If no device produced a log, a BuildError without
+// a device is returned.
+func (p *Program) BuildProgram(devices []*Device, options string) error {
+	var cOptions *C.char
+	if options != "" {
+		cOptions = C.CString(options)
+		defer C.free(unsafe.Pointer(cOptions))
+	}
+	var deviceList []C.cl_device_id
+	var deviceListPtr *C.cl_device_id
+	numDevices := C.cl_uint(len(devices))
+	if len(devices) > 0 {
+		deviceList = buildDeviceIdList(devices)
+		deviceListPtr = &deviceList[0]
+	}
+	if status := C.clBuildProgram(p.clProgram, numDevices, deviceListPtr, cOptions, nil, nil); status == C.CL_SUCCESS {
+		return nil
+	}
+
+	buffer := make([]byte, 4096)
+	for _, dev := range p.devices {
+		var bLen C.size_t
+		var err C.cl_int
+		for i := 2; i >= 0; i-- {
+			err = C.clGetProgramBuildInfo(p.clProgram, dev.id, C.CL_PROGRAM_BUILD_LOG, C.size_t(len(buffer)), unsafe.Pointer(&buffer[0]), &bLen)
+			// INVALID_VALUE probably means our buffer isn't large enough.
+			// Only retry when the reported size really exceeds the current
+			// buffer: reallocating to a size of zero would make the
+			// &buffer[0] above panic on the next attempt.
+			if err != C.CL_INVALID_VALUE || i == 0 || bLen >= 1024*1024 || int(bLen) <= len(buffer) {
+				break
+			}
+			buffer = make([]byte, bLen)
+		}
+		if err != C.CL_SUCCESS {
+			return toError(err)
+		}
+
+		if bLen > 1 {
+			// bLen-1 drops the final byte of the reported log length.
+			return BuildError{
+				Device:  dev,
+				Message: string(buffer[:bLen-1]),
+			}
+		}
+	}
+
+	return BuildError{
+		Device:  nil,
+		Message: "build failed and produced no log entries",
+	}
+}
+
+// CreateKernel creates the kernel called name from the program and registers
+// a finalizer that releases it if Release is never called.
+//
+// It returns the converted OpenCL error when clCreateKernel reports one, and
+// ErrUnknown when OpenCL reports success but hands back a nil kernel.
+func (p *Program) CreateKernel(name string) (*Kernel, error) {
+	cName := C.CString(name)
+	defer C.free(unsafe.Pointer(cName))
+	var err C.cl_int
+	clKernel := C.clCreateKernel(p.clProgram, cName, &err)
+	if err != C.CL_SUCCESS {
+		return nil, toError(err)
+	}
+	if clKernel == nil {
+		// Never wrap a nil handle; later calls on it could not succeed.
+		return nil, ErrUnknown
+	}
+	kernel := &Kernel{clKernel: clKernel, name: name}
+	runtime.SetFinalizer(kernel, releaseKernel)
+	return kernel, nil
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/queue.go
@ -0,0 +1,193 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import "unsafe"
+
+// CommandQueueProperty is a command-queue property value; the constants below
+// take their values from the corresponding CL_QUEUE_* constants.
+type CommandQueueProperty int
+
+const (
+	CommandQueueOutOfOrderExecModeEnable CommandQueueProperty = C.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+	CommandQueueProfilingEnable          CommandQueueProperty = C.CL_QUEUE_PROFILING_ENABLE
+)
+
+// CommandQueue wraps an OpenCL command-queue handle together with a device.
+type CommandQueue struct {
+	clQueue C.cl_command_queue
+	device  *Device
+}
+
+// releaseCommandQueue releases the underlying OpenCL command queue, if one is
+// held, and clears the handle so that a repeated call does nothing.
+func releaseCommandQueue(q *CommandQueue) {
+	if q.clQueue == nil {
+		return
+	}
+	C.clReleaseCommandQueue(q.clQueue)
+	q.clQueue = nil
+}
+
+// Release calls clReleaseCommandQueue on the CommandQueue and clears its
+// handle. The CommandQueue must not be used after Release.
+func (q *CommandQueue) Release() {
+	releaseCommandQueue(q)
+}
+
+// Finish blocks until all previously queued OpenCL commands in the
+// command-queue have been issued to the associated device and have completed.
+func (q *CommandQueue) Finish() error {
+	status := C.clFinish(q.clQueue)
+	return toError(status)
+}
+
+// Flush issues all previously queued OpenCL commands in the command-queue to
+// the device associated with the command-queue.
+func (q *CommandQueue) Flush() error {
+	status := C.clFlush(q.clQueue)
+	return toError(status)
+}
+
+// EnqueueMapBuffer enqueues a command to map size bytes of buffer, starting
+// at offset, into the host address space and returns the mapped region.
+//
+// When OpenCL reports an error no event is returned. When it reports success
+// but a nil pointer, the event is returned together with ErrUnknown.
+func (q *CommandQueue) EnqueueMapBuffer(buffer *MemObject, blocking bool, flags MapFlag, offset, size int, eventWaitList []*Event) (*MappedMemObject, *Event, error) {
+	var (
+		clEvent C.cl_event
+		status  C.cl_int
+	)
+	mapped := C.clEnqueueMapBuffer(
+		q.clQueue,
+		buffer.clMem,
+		clBool(blocking),
+		flags.toCl(),
+		C.size_t(offset),
+		C.size_t(size),
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+		&status,
+	)
+	if status != C.CL_SUCCESS {
+		return nil, nil, toError(status)
+	}
+	ev := newEvent(clEvent)
+	if mapped == nil {
+		return nil, ev, ErrUnknown
+	}
+	return &MappedMemObject{ptr: mapped, size: size}, ev, nil
+}
+
+// EnqueueMapImage enqueues a command to map the region of an image object
+// described by origin and region into the host address space.
+//
+// The returned MappedMemObject carries the mapped pointer and the row and
+// slice pitches reported by OpenCL; its size is left at zero. When OpenCL
+// reports an error no event is returned. When it reports success but a nil
+// pointer, the event is returned together with ErrUnknown.
+func (q *CommandQueue) EnqueueMapImage(buffer *MemObject, blocking bool, flags MapFlag, origin, region [3]int, eventWaitList []*Event) (*MappedMemObject, *Event, error) {
+	var (
+		clEvent    C.cl_event
+		status     C.cl_int
+		rowPitch   C.size_t
+		slicePitch C.size_t
+	)
+	cOrigin, cRegion := sizeT3(origin), sizeT3(region)
+	mapped := C.clEnqueueMapImage(
+		q.clQueue,
+		buffer.clMem,
+		clBool(blocking),
+		flags.toCl(),
+		&cOrigin[0],
+		&cRegion[0],
+		&rowPitch,
+		&slicePitch,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+		&status,
+	)
+	if status != C.CL_SUCCESS {
+		return nil, nil, toError(status)
+	}
+	ev := newEvent(clEvent)
+	if mapped == nil {
+		return nil, ev, ErrUnknown
+	}
+	// TODO: the mapped size could be calculated; it is reported as 0 for now.
+	result := &MappedMemObject{
+		ptr:        mapped,
+		size:       0,
+		rowPitch:   int(rowPitch),
+		slicePitch: int(slicePitch),
+	}
+	return result, ev, nil
+}
+
+// EnqueueUnmapMemObject enqueues a command to unmap a previously mapped
+// region of a memory object. No event is returned when OpenCL reports an
+// error.
+func (q *CommandQueue) EnqueueUnmapMemObject(buffer *MemObject, mappedObj *MappedMemObject, eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueUnmapMemObject(
+		q.clQueue,
+		buffer.clMem,
+		mappedObj.ptr,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	if status != C.CL_SUCCESS {
+		return nil, toError(status)
+	}
+	return newEvent(clEvent), nil
+}
+
+// EnqueueCopyBuffer enqueues a command to copy byteCount bytes from srcBuffer
+// (starting at srcOffset) to dstBuffer (starting at dstOffset). The event is
+// wrapped and returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueCopyBuffer(srcBuffer, dstBuffer *MemObject, srcOffset, dstOffset, byteCount int, eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueCopyBuffer(
+		q.clQueue,
+		srcBuffer.clMem,
+		dstBuffer.clMem,
+		C.size_t(srcOffset),
+		C.size_t(dstOffset),
+		C.size_t(byteCount),
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
+
+// EnqueueWriteBuffer enqueues a command to write dataSize bytes from the host
+// memory at dataPtr into buffer, starting at offset. The event is wrapped and
+// returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueWriteBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueWriteBuffer(
+		q.clQueue,
+		buffer.clMem,
+		clBool(blocking),
+		C.size_t(offset),
+		C.size_t(dataSize),
+		dataPtr,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
+
+// EnqueueWriteBufferFloat32 enqueues a write of the float32 values in data to
+// buffer, starting at byte offset offset. The byte count is derived from
+// len(data).
+//
+// An empty data slice has no first element to take the address of, so it is
+// rejected with ErrInvalidValue instead of panicking.
+func (q *CommandQueue) EnqueueWriteBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) {
+	if len(data) == 0 {
+		return nil, ErrInvalidValue
+	}
+	dataPtr := unsafe.Pointer(&data[0])
+	dataSize := int(unsafe.Sizeof(data[0])) * len(data)
+	return q.EnqueueWriteBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList)
+}
+
+// EnqueueReadBuffer enqueues a command to read dataSize bytes from buffer,
+// starting at offset, into the host memory at dataPtr. The event is wrapped
+// and returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueReadBuffer(buffer *MemObject, blocking bool, offset, dataSize int, dataPtr unsafe.Pointer, eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueReadBuffer(
+		q.clQueue,
+		buffer.clMem,
+		clBool(blocking),
+		C.size_t(offset),
+		C.size_t(dataSize),
+		dataPtr,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
+
+// EnqueueReadBufferFloat32 enqueues a read from buffer, starting at byte
+// offset offset, into the float32 slice data. The byte count is derived from
+// len(data).
+//
+// An empty data slice has no first element to take the address of, so it is
+// rejected with ErrInvalidValue instead of panicking.
+func (q *CommandQueue) EnqueueReadBufferFloat32(buffer *MemObject, blocking bool, offset int, data []float32, eventWaitList []*Event) (*Event, error) {
+	if len(data) == 0 {
+		return nil, ErrInvalidValue
+	}
+	dataPtr := unsafe.Pointer(&data[0])
+	dataSize := int(unsafe.Sizeof(data[0])) * len(data)
+	return q.EnqueueReadBuffer(buffer, blocking, offset, dataSize, dataPtr, eventWaitList)
+}
+
+// EnqueueNDRangeKernel enqueues a command to execute a kernel on a device.
+//
+// The work dimension is len(globalWorkSize). Each of globalWorkOffset,
+// globalWorkSize and localWorkSize is converted to a size_t array; a nil or
+// empty slice is passed to OpenCL as NULL (an empty, non-nil slice used to
+// panic when its first element was addressed). The event is wrapped and
+// returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueNDRangeKernel(kernel *Kernel, globalWorkOffset, globalWorkSize, localWorkSize []int, eventWaitList []*Event) (*Event, error) {
+	// sizePtr copies values into a fresh size_t slice and returns a pointer
+	// to its first element, or nil when there is nothing to copy.
+	sizePtr := func(values []int) *C.size_t {
+		if len(values) == 0 {
+			return nil
+		}
+		list := make([]C.size_t, len(values))
+		for i, v := range values {
+			list[i] = C.size_t(v)
+		}
+		return &list[0]
+	}
+
+	workDim := len(globalWorkSize)
+	globalWorkOffsetPtr := sizePtr(globalWorkOffset)
+	globalWorkSizePtr := sizePtr(globalWorkSize)
+	localWorkSizePtr := sizePtr(localWorkSize)
+
+	var event C.cl_event
+	err := toError(C.clEnqueueNDRangeKernel(q.clQueue, kernel.clKernel, C.cl_uint(workDim), globalWorkOffsetPtr, globalWorkSizePtr, localWorkSizePtr, C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+	return newEvent(event), err
+}
+
+// EnqueueReadImage enqueues a command to read from a 2D or 3D image object to
+// host memory. origin and region select the part of the image to read, and
+// data receives the pixels using the given row and slice pitches.
+//
+// An empty data slice has no first element to take the address of, so it is
+// rejected with ErrInvalidValue instead of panicking. Otherwise the event is
+// wrapped and returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueReadImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) {
+	if len(data) == 0 {
+		return nil, ErrInvalidValue
+	}
+	cOrigin := sizeT3(origin)
+	cRegion := sizeT3(region)
+	var event C.cl_event
+	err := toError(C.clEnqueueReadImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+	return newEvent(event), err
+}
+
+// EnqueueWriteImage enqueues a command to write to a 2D or 3D image object
+// from host memory. origin and region select the part of the image to
+// overwrite, and data supplies the pixels using the given row and slice
+// pitches.
+//
+// An empty data slice has no first element to take the address of, so it is
+// rejected with ErrInvalidValue instead of panicking. Otherwise the event is
+// wrapped and returned alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueWriteImage(image *MemObject, blocking bool, origin, region [3]int, rowPitch, slicePitch int, data []byte, eventWaitList []*Event) (*Event, error) {
+	if len(data) == 0 {
+		return nil, ErrInvalidValue
+	}
+	cOrigin := sizeT3(origin)
+	cRegion := sizeT3(region)
+	var event C.cl_event
+	err := toError(C.clEnqueueWriteImage(q.clQueue, image.clMem, clBool(blocking), &cOrigin[0], &cRegion[0], C.size_t(rowPitch), C.size_t(slicePitch), unsafe.Pointer(&data[0]), C.cl_uint(len(eventWaitList)), eventListPtr(eventWaitList), &event))
+	return newEvent(event), err
+}
+
+// EnqueueFillBuffer passes the pattern pointer, patternSize, offset and size
+// to clEnqueueFillBuffer for buffer. The event is wrapped and returned
+// alongside the converted status, even on error.
+func (q *CommandQueue) EnqueueFillBuffer(buffer *MemObject, pattern unsafe.Pointer, patternSize, offset, size int, eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueFillBuffer(
+		q.clQueue,
+		buffer.clMem,
+		pattern,
+		C.size_t(patternSize),
+		C.size_t(offset),
+		C.size_t(size),
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
+
+// EnqueueBarrierWithWaitList enqueues a barrier operation, which acts as a
+// synchronization point.
+func (q *CommandQueue) EnqueueBarrierWithWaitList(eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueBarrierWithWaitList(
+		q.clQueue,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
+
+// EnqueueMarkerWithWaitList enqueues a marker command which waits for either
+// a list of events to complete, or all previously enqueued commands to
+// complete.
+func (q *CommandQueue) EnqueueMarkerWithWaitList(eventWaitList []*Event) (*Event, error) {
+	var clEvent C.cl_event
+	status := C.clEnqueueMarkerWithWaitList(
+		q.clQueue,
+		C.cl_uint(len(eventWaitList)),
+		eventListPtr(eventWaitList),
+		&clEvent,
+	)
+	err := toError(status)
+	return newEvent(clEvent), err
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types.go
@ -0,0 +1,487 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"reflect"
+	"runtime"
+	"strings"
+	"unsafe"
+)
+
+// ErrUnknown generally signals an unexpected result from an OpenCL function
+// (e.g. CL_SUCCESS but null pointer).
+var ErrUnknown = errors.New("cl: unknown error")
+
+// ErrOther is an error that carries a raw integer error code.
+type ErrOther int
+
+// Error implements the error interface by printing the numeric code.
+func (e ErrOther) Error() string {
+	return "cl: error " + fmt.Sprint(int(e))
+}
+
+// Sentinel errors, each named after an OpenCL status code. toError returns
+// them for the status codes that are listed in errorMap.
+var (
+	ErrDeviceNotFound                     = errors.New("cl: Device Not Found")
+	ErrDeviceNotAvailable                 = errors.New("cl: Device Not Available")
+	ErrCompilerNotAvailable               = errors.New("cl: Compiler Not Available")
+	ErrMemObjectAllocationFailure         = errors.New("cl: Mem Object Allocation Failure")
+	ErrOutOfResources                     = errors.New("cl: Out Of Resources")
+	ErrOutOfHostMemory                    = errors.New("cl: Out Of Host Memory")
+	ErrProfilingInfoNotAvailable          = errors.New("cl: Profiling Info Not Available")
+	ErrMemCopyOverlap                     = errors.New("cl: Mem Copy Overlap")
+	ErrImageFormatMismatch                = errors.New("cl: Image Format Mismatch")
+	ErrImageFormatNotSupported            = errors.New("cl: Image Format Not Supported")
+	ErrBuildProgramFailure                = errors.New("cl: Build Program Failure")
+	ErrMapFailure                         = errors.New("cl: Map Failure")
+	ErrMisalignedSubBufferOffset          = errors.New("cl: Misaligned Sub Buffer Offset")
+	ErrExecStatusErrorForEventsInWaitList = errors.New("cl: Exec Status Error For Events In Wait List")
+	ErrCompileProgramFailure              = errors.New("cl: Compile Program Failure")
+	ErrLinkerNotAvailable                 = errors.New("cl: Linker Not Available")
+	ErrLinkProgramFailure                 = errors.New("cl: Link Program Failure")
+	ErrDevicePartitionFailed              = errors.New("cl: Device Partition Failed")
+	ErrKernelArgInfoNotAvailable          = errors.New("cl: Kernel Arg Info Not Available")
+	ErrInvalidValue                       = errors.New("cl: Invalid Value")
+	ErrInvalidDeviceType                  = errors.New("cl: Invalid Device Type")
+	ErrInvalidPlatform                    = errors.New("cl: Invalid Platform")
+	ErrInvalidDevice                      = errors.New("cl: Invalid Device")
+	ErrInvalidContext                     = errors.New("cl: Invalid Context")
+	ErrInvalidQueueProperties             = errors.New("cl: Invalid Queue Properties")
+	ErrInvalidCommandQueue                = errors.New("cl: Invalid Command Queue")
+	ErrInvalidHostPtr                     = errors.New("cl: Invalid Host Ptr")
+	ErrInvalidMemObject                   = errors.New("cl: Invalid Mem Object")
+	ErrInvalidImageFormatDescriptor       = errors.New("cl: Invalid Image Format Descriptor")
+	ErrInvalidImageSize                   = errors.New("cl: Invalid Image Size")
+	ErrInvalidSampler                     = errors.New("cl: Invalid Sampler")
+	ErrInvalidBinary                      = errors.New("cl: Invalid Binary")
+	ErrInvalidBuildOptions                = errors.New("cl: Invalid Build Options")
+	ErrInvalidProgram                     = errors.New("cl: Invalid Program")
+	ErrInvalidProgramExecutable           = errors.New("cl: Invalid Program Executable")
+	ErrInvalidKernelName                  = errors.New("cl: Invalid Kernel Name")
+	ErrInvalidKernelDefinition            = errors.New("cl: Invalid Kernel Definition")
+	ErrInvalidKernel                      = errors.New("cl: Invalid Kernel")
+	ErrInvalidArgIndex                    = errors.New("cl: Invalid Arg Index")
+	ErrInvalidArgValue                    = errors.New("cl: Invalid Arg Value")
+	ErrInvalidArgSize                     = errors.New("cl: Invalid Arg Size")
+	ErrInvalidKernelArgs                  = errors.New("cl: Invalid Kernel Args")
+	ErrInvalidWorkDimension               = errors.New("cl: Invalid Work Dimension")
+	ErrInvalidWorkGroupSize               = errors.New("cl: Invalid Work Group Size")
+	ErrInvalidWorkItemSize                = errors.New("cl: Invalid Work Item Size")
+	ErrInvalidGlobalOffset                = errors.New("cl: Invalid Global Offset")
+	ErrInvalidEventWaitList               = errors.New("cl: Invalid Event Wait List")
+	ErrInvalidEvent                       = errors.New("cl: Invalid Event")
+	ErrInvalidOperation                   = errors.New("cl: Invalid Operation")
+	ErrInvalidGlObject                    = errors.New("cl: Invalid Gl Object")
+	ErrInvalidBufferSize                  = errors.New("cl: Invalid Buffer Size")
+	ErrInvalidMipLevel                    = errors.New("cl: Invalid Mip Level")
+	ErrInvalidGlobalWorkSize              = errors.New("cl: Invalid Global Work Size")
+	ErrInvalidProperty                    = errors.New("cl: Invalid Property")
+	ErrInvalidImageDescriptor             = errors.New("cl: Invalid Image Descriptor")
+	ErrInvalidCompilerOptions             = errors.New("cl: Invalid Compiler Options")
+	ErrInvalidLinkerOptions               = errors.New("cl: Invalid Linker Options")
+	ErrInvalidDevicePartitionCount        = errors.New("cl: Invalid Device Partition Count")
+)
+// errorMap translates OpenCL status codes into the package's sentinel errors.
+// CL_SUCCESS maps to a nil error. Codes that are not listed here are reported
+// by toError as ErrOther.
+var errorMap = map[C.cl_int]error{
+	C.CL_SUCCESS:                                   nil,
+	C.CL_DEVICE_NOT_FOUND:                          ErrDeviceNotFound,
+	C.CL_DEVICE_NOT_AVAILABLE:                      ErrDeviceNotAvailable,
+	C.CL_COMPILER_NOT_AVAILABLE:                    ErrCompilerNotAvailable,
+	C.CL_MEM_OBJECT_ALLOCATION_FAILURE:             ErrMemObjectAllocationFailure,
+	C.CL_OUT_OF_RESOURCES:                          ErrOutOfResources,
+	C.CL_OUT_OF_HOST_MEMORY:                        ErrOutOfHostMemory,
+	C.CL_PROFILING_INFO_NOT_AVAILABLE:              ErrProfilingInfoNotAvailable,
+	C.CL_MEM_COPY_OVERLAP:                          ErrMemCopyOverlap,
+	C.CL_IMAGE_FORMAT_MISMATCH:                     ErrImageFormatMismatch,
+	C.CL_IMAGE_FORMAT_NOT_SUPPORTED:                ErrImageFormatNotSupported,
+	C.CL_BUILD_PROGRAM_FAILURE:                     ErrBuildProgramFailure,
+	C.CL_MAP_FAILURE:                               ErrMapFailure,
+	C.CL_MISALIGNED_SUB_BUFFER_OFFSET:              ErrMisalignedSubBufferOffset,
+	C.CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: ErrExecStatusErrorForEventsInWaitList,
+	C.CL_INVALID_VALUE:                             ErrInvalidValue,
+	C.CL_INVALID_DEVICE_TYPE:                       ErrInvalidDeviceType,
+	C.CL_INVALID_PLATFORM:                          ErrInvalidPlatform,
+	C.CL_INVALID_DEVICE:                            ErrInvalidDevice,
+	C.CL_INVALID_CONTEXT:                           ErrInvalidContext,
+	C.CL_INVALID_QUEUE_PROPERTIES:                  ErrInvalidQueueProperties,
+	C.CL_INVALID_COMMAND_QUEUE:                     ErrInvalidCommandQueue,
+	C.CL_INVALID_HOST_PTR:                          ErrInvalidHostPtr,
+	C.CL_INVALID_MEM_OBJECT:                        ErrInvalidMemObject,
+	C.CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:           ErrInvalidImageFormatDescriptor,
+	C.CL_INVALID_IMAGE_SIZE:                        ErrInvalidImageSize,
+	C.CL_INVALID_SAMPLER:                           ErrInvalidSampler,
+	C.CL_INVALID_BINARY:                            ErrInvalidBinary,
+	C.CL_INVALID_BUILD_OPTIONS:                     ErrInvalidBuildOptions,
+	C.CL_INVALID_PROGRAM:                           ErrInvalidProgram,
+	C.CL_INVALID_PROGRAM_EXECUTABLE:                ErrInvalidProgramExecutable,
+	C.CL_INVALID_KERNEL_NAME:                       ErrInvalidKernelName,
+	C.CL_INVALID_KERNEL_DEFINITION:                 ErrInvalidKernelDefinition,
+	C.CL_INVALID_KERNEL:                            ErrInvalidKernel,
+	C.CL_INVALID_ARG_INDEX:                         ErrInvalidArgIndex,
+	C.CL_INVALID_ARG_VALUE:                         ErrInvalidArgValue,
+	C.CL_INVALID_ARG_SIZE:                          ErrInvalidArgSize,
+	C.CL_INVALID_KERNEL_ARGS:                       ErrInvalidKernelArgs,
+	C.CL_INVALID_WORK_DIMENSION:                    ErrInvalidWorkDimension,
+	C.CL_INVALID_WORK_GROUP_SIZE:                   ErrInvalidWorkGroupSize,
+	C.CL_INVALID_WORK_ITEM_SIZE:                    ErrInvalidWorkItemSize,
+	C.CL_INVALID_GLOBAL_OFFSET:                     ErrInvalidGlobalOffset,
+	C.CL_INVALID_EVENT_WAIT_LIST:                   ErrInvalidEventWaitList,
+	C.CL_INVALID_EVENT:                             ErrInvalidEvent,
+	C.CL_INVALID_OPERATION:                         ErrInvalidOperation,
+	C.CL_INVALID_GL_OBJECT:                         ErrInvalidGlObject,
+	C.CL_INVALID_BUFFER_SIZE:                       ErrInvalidBufferSize,
+	C.CL_INVALID_MIP_LEVEL:                         ErrInvalidMipLevel,
+	C.CL_INVALID_GLOBAL_WORK_SIZE:                  ErrInvalidGlobalWorkSize,
+	C.CL_INVALID_PROPERTY:                          ErrInvalidProperty,
+}
+
+// toError converts an OpenCL status code into a Go error. Codes found in
+// errorMap yield the mapped value (nil for CL_SUCCESS); any other code is
+// wrapped as ErrOther.
+func toError(code C.cl_int) error {
+	mapped, known := errorMap[code]
+	if !known {
+		return ErrOther(code)
+	}
+	return mapped
+}
+
+// LocalMemType is an integer type whose constants take their values from
+// CL_NONE, CL_GLOBAL and CL_LOCAL.
+type LocalMemType int
+
+const (
+	LocalMemTypeNone   LocalMemType = C.CL_NONE
+	LocalMemTypeGlobal LocalMemType = C.CL_GLOBAL
+	LocalMemTypeLocal  LocalMemType = C.CL_LOCAL
+)
+
+// localMemTypeMap holds the display name of each LocalMemType constant; it is
+// consulted by LocalMemType.String.
+var localMemTypeMap = map[LocalMemType]string{
+	LocalMemTypeNone:   "None",
+	LocalMemTypeGlobal: "Global",
+	LocalMemTypeLocal:  "Local",
+}
+
+// String returns the display name of t, or "Unknown" when t has no entry in
+// localMemTypeMap.
+func (t LocalMemType) String() string {
+	if name := localMemTypeMap[t]; name != "" {
+		return name
+	}
+	return "Unknown"
+}
+
+// ExecCapability is a set of execution-capability bits; String tests each
+// constant below with a bitwise AND.
+type ExecCapability int
+
+const (
+	ExecCapabilityKernel       ExecCapability = C.CL_EXEC_KERNEL        // The OpenCL device can execute OpenCL kernels.
+	ExecCapabilityNativeKernel ExecCapability = C.CL_EXEC_NATIVE_KERNEL // The OpenCL device can execute native kernels.
+)
+
+// String lists the capability bits set in ec, joined with "|": "Kernel",
+// "NativeKernel", or both. It returns an empty string when neither bit is
+// set.
+func (ec ExecCapability) String() string {
+	var names []string
+	if ec&ExecCapabilityKernel != 0 {
+		names = append(names, "Kernel")
+	}
+	if ec&ExecCapabilityNativeKernel != 0 {
+		names = append(names, "NativeKernel")
+	}
+	// Joining an empty list already produces "".
+	return strings.Join(names, "|")
+}
+
+// MemCacheType is an integer type whose constants take their values from
+// CL_NONE, CL_READ_ONLY_CACHE and CL_READ_WRITE_CACHE.
+type MemCacheType int
+
+const (
+	MemCacheTypeNone           MemCacheType = C.CL_NONE
+	MemCacheTypeReadOnlyCache  MemCacheType = C.CL_READ_ONLY_CACHE
+	MemCacheTypeReadWriteCache MemCacheType = C.CL_READ_WRITE_CACHE
+)
+
+// String returns "None", "ReadOnly" or "ReadWrite" for the known cache types
+// and "Unknown(<hex value>)" for anything else.
+func (ct MemCacheType) String() string {
+	if ct == MemCacheTypeNone {
+		return "None"
+	}
+	if ct == MemCacheTypeReadOnlyCache {
+		return "ReadOnly"
+	}
+	if ct == MemCacheTypeReadWriteCache {
+		return "ReadWrite"
+	}
+	return fmt.Sprintf("Unknown(%x)", int(ct))
+}
+
+// MemFlag is an integer type whose constants take their values from the
+// corresponding CL_MEM_* flag constants.
+type MemFlag int
+
+const (
+	MemReadWrite    MemFlag = C.CL_MEM_READ_WRITE
+	MemWriteOnly    MemFlag = C.CL_MEM_WRITE_ONLY
+	MemReadOnly     MemFlag = C.CL_MEM_READ_ONLY
+	MemUseHostPtr   MemFlag = C.CL_MEM_USE_HOST_PTR
+	MemAllocHostPtr MemFlag = C.CL_MEM_ALLOC_HOST_PTR
+	MemCopyHostPtr  MemFlag = C.CL_MEM_COPY_HOST_PTR
+
+	// Flags taken from the CL_MEM_HOST_* constants.
+	MemWriteOnlyHost MemFlag = C.CL_MEM_HOST_WRITE_ONLY
+	MemReadOnlyHost  MemFlag = C.CL_MEM_HOST_READ_ONLY
+	MemNoAccessHost  MemFlag = C.CL_MEM_HOST_NO_ACCESS
+)
+
+// MemObjectType is an integer type whose constants take their values from the
+// corresponding CL_MEM_OBJECT_* constants.
+type MemObjectType int
+
+const (
+	MemObjectTypeBuffer  MemObjectType = C.CL_MEM_OBJECT_BUFFER
+	MemObjectTypeImage2D MemObjectType = C.CL_MEM_OBJECT_IMAGE2D
+	MemObjectTypeImage3D MemObjectType = C.CL_MEM_OBJECT_IMAGE3D
+)
+
+// MapFlag is an integer type whose constants take their values from the
+// corresponding CL_MAP_* constants.
+type MapFlag int
+
+const (
+	// This flag specifies that the region being mapped in the memory object is being mapped for reading.
+	MapFlagRead                  MapFlag = C.CL_MAP_READ
+	MapFlagWrite                 MapFlag = C.CL_MAP_WRITE
+	MapFlagWriteInvalidateRegion MapFlag = C.CL_MAP_WRITE_INVALIDATE_REGION
+)
+
+// toCl converts mf to the C cl_map_flags type.
+func (mf MapFlag) toCl() C.cl_map_flags {
+	return C.cl_map_flags(mf)
+}
+
+// ChannelOrder is an integer type whose constants take their values from the
+// corresponding OpenCL channel-order constants (CL_R, CL_RGBA, ...).
+type ChannelOrder int
+
+const (
+	ChannelOrderR         ChannelOrder = C.CL_R
+	ChannelOrderA         ChannelOrder = C.CL_A
+	ChannelOrderRG        ChannelOrder = C.CL_RG
+	ChannelOrderRA        ChannelOrder = C.CL_RA
+	ChannelOrderRGB       ChannelOrder = C.CL_RGB
+	ChannelOrderRGBA      ChannelOrder = C.CL_RGBA
+	ChannelOrderBGRA      ChannelOrder = C.CL_BGRA
+	ChannelOrderARGB      ChannelOrder = C.CL_ARGB
+	ChannelOrderIntensity ChannelOrder = C.CL_INTENSITY
+	ChannelOrderLuminance ChannelOrder = C.CL_LUMINANCE
+	ChannelOrderRx        ChannelOrder = C.CL_Rx
+	ChannelOrderRGx       ChannelOrder = C.CL_RGx
+	ChannelOrderRGBx      ChannelOrder = C.CL_RGBx
+)
+
+// channelOrderNameMap holds the display name of each ChannelOrder constant;
+// it is consulted by ChannelOrder.String.
+var channelOrderNameMap = map[ChannelOrder]string{
+	ChannelOrderR:         "R",
+	ChannelOrderA:         "A",
+	ChannelOrderRG:        "RG",
+	ChannelOrderRA:        "RA",
+	ChannelOrderRGB:       "RGB",
+	ChannelOrderRGBA:      "RGBA",
+	ChannelOrderBGRA:      "BGRA",
+	ChannelOrderARGB:      "ARGB",
+	ChannelOrderIntensity: "Intensity",
+	ChannelOrderLuminance: "Luminance",
+	ChannelOrderRx:        "Rx",
+	ChannelOrderRGx:       "RGx",
+	ChannelOrderRGBx:      "RGBx",
+}
+
+// String returns the display name of co from channelOrderNameMap, or
+// "Unknown(<hex value>)" when co has no entry there.
+func (co ChannelOrder) String() string {
+	if name := channelOrderNameMap[co]; name != "" {
+		return name
+	}
+	return fmt.Sprintf("Unknown(%x)", int(co))
+}
+
+// ChannelDataType is the per-channel storage type of an image format. The
+// values mirror the OpenCL channel type constants (CL_SNORM_INT8, ...).
+type ChannelDataType int
+
+const (
+	ChannelDataTypeSNormInt8      ChannelDataType = C.CL_SNORM_INT8
+	ChannelDataTypeSNormInt16     ChannelDataType = C.CL_SNORM_INT16
+	ChannelDataTypeUNormInt8      ChannelDataType = C.CL_UNORM_INT8
+	ChannelDataTypeUNormInt16     ChannelDataType = C.CL_UNORM_INT16
+	ChannelDataTypeUNormShort565  ChannelDataType = C.CL_UNORM_SHORT_565
+	ChannelDataTypeUNormShort555  ChannelDataType = C.CL_UNORM_SHORT_555
+	ChannelDataTypeUNormInt101010 ChannelDataType = C.CL_UNORM_INT_101010
+	ChannelDataTypeSignedInt8     ChannelDataType = C.CL_SIGNED_INT8
+	ChannelDataTypeSignedInt16    ChannelDataType = C.CL_SIGNED_INT16
+	ChannelDataTypeSignedInt32    ChannelDataType = C.CL_SIGNED_INT32
+	ChannelDataTypeUnsignedInt8   ChannelDataType = C.CL_UNSIGNED_INT8
+	ChannelDataTypeUnsignedInt16  ChannelDataType = C.CL_UNSIGNED_INT16
+	ChannelDataTypeUnsignedInt32  ChannelDataType = C.CL_UNSIGNED_INT32
+	ChannelDataTypeHalfFloat      ChannelDataType = C.CL_HALF_FLOAT
+	ChannelDataTypeFloat          ChannelDataType = C.CL_FLOAT
+)
+
+// channelDataTypeNameMap holds the display name of each known
+// ChannelDataType. It is consulted by ChannelDataType.String; other files
+// in the package add further entries from their init functions.
+var channelDataTypeNameMap = map[ChannelDataType]string{
+	ChannelDataTypeSNormInt8:      "SNormInt8",
+	ChannelDataTypeSNormInt16:     "SNormInt16",
+	ChannelDataTypeUNormInt8:      "UNormInt8",
+	ChannelDataTypeUNormInt16:     "UNormInt16",
+	ChannelDataTypeUNormShort565:  "UNormShort565",
+	ChannelDataTypeUNormShort555:  "UNormShort555",
+	ChannelDataTypeUNormInt101010: "UNormInt101010",
+	ChannelDataTypeSignedInt8:     "SignedInt8",
+	ChannelDataTypeSignedInt16:    "SignedInt16",
+	ChannelDataTypeSignedInt32:    "SignedInt32",
+	ChannelDataTypeUnsignedInt8:   "UnsignedInt8",
+	ChannelDataTypeUnsignedInt16:  "UnsignedInt16",
+	ChannelDataTypeUnsignedInt32:  "UnsignedInt32",
+	ChannelDataTypeHalfFloat:      "HalfFloat",
+	ChannelDataTypeFloat:          "Float",
+}
+
+// String returns the registered display name of the channel data type, or
+// "Unknown(<hex value>)" when no non-empty name is registered for it.
+func (ct ChannelDataType) String() string {
+	if known := channelDataTypeNameMap[ct]; known != "" {
+		return known
+	}
+	return fmt.Sprintf("Unknown(%x)", int(ct))
+}
+
+// ImageFormat pairs a channel order with a channel data type; toCl turns
+// it into the C cl_image_format structure.
+type ImageFormat struct {
+	ChannelOrder    ChannelOrder
+	ChannelDataType ChannelDataType
+}
+
+func (f ImageFormat) toCl() C.cl_image_format {
+	var format C.cl_image_format
+	format.image_channel_order = C.cl_channel_order(f.ChannelOrder)
+	format.image_channel_data_type = C.cl_channel_type(f.ChannelDataType)
+	return format
+}
+
+// ProfilingInfo selects which timestamp Event.GetEventProfilingInfo
+// queries. The values mirror the CL_PROFILING_COMMAND_* constants.
+type ProfilingInfo int
+
+const (
+	// A 64-bit value that describes the current device time counter in
+	// nanoseconds when the command identified by event is enqueued in
+	// a command-queue by the host.
+	ProfilingInfoCommandQueued ProfilingInfo = C.CL_PROFILING_COMMAND_QUEUED
+	// A 64-bit value that describes the current device time counter in
+	// nanoseconds when the command identified by event that has been
+	// enqueued is submitted by the host to the device associated with the command-queue.
+	ProfilingInfoCommandSubmit ProfilingInfo = C.CL_PROFILING_COMMAND_SUBMIT
+	// A 64-bit value that describes the current device time counter in
+	// nanoseconds when the command identified by event starts execution on the device.
+	ProfilingInfoCommandStart ProfilingInfo = C.CL_PROFILING_COMMAND_START
+	// A 64-bit value that describes the current device time counter in
+	// nanoseconds when the command identified by event has finished
+	// execution on the device.
+	ProfilingInfoCommandEnd ProfilingInfo = C.CL_PROFILING_COMMAND_END
+)
+
+// CommmandExecStatus is the execution status of an enqueued command. The
+// values mirror CL_COMPLETE, CL_RUNNING, CL_SUBMITTED and CL_QUEUED.
+//
+// NOTE(review): the identifier is spelled with three m's; it is exported,
+// so renaming it would break existing callers.
+type CommmandExecStatus int
+
+const (
+	CommmandExecStatusComplete  CommmandExecStatus = C.CL_COMPLETE
+	CommmandExecStatusRunning   CommmandExecStatus = C.CL_RUNNING
+	CommmandExecStatusSubmitted CommmandExecStatus = C.CL_SUBMITTED
+	CommmandExecStatusQueued    CommmandExecStatus = C.CL_QUEUED
+)
+
+// Event wraps an OpenCL cl_event handle. clEvent is nil once the event
+// has been released.
+type Event struct {
+	clEvent C.cl_event
+}
+
+// releaseEvent releases the underlying cl_event, if any, and clears the
+// handle so that a second call is a no-op.
+func releaseEvent(ev *Event) {
+	if ev.clEvent == nil {
+		return
+	}
+	C.clReleaseEvent(ev.clEvent)
+	ev.clEvent = nil
+}
+
+// Release frees the OpenCL event held by e. It delegates to releaseEvent,
+// which ignores events whose handle is already nil.
+func (e *Event) Release() {
+	releaseEvent(e)
+}
+
+// GetEventProfilingInfo queries clGetEventProfilingInfo for the counter
+// selected by paramName and returns it as an int64. On any status other
+// than CL_SUCCESS it returns 0 and the status converted by toError.
+func (e *Event) GetEventProfilingInfo(paramName ProfilingInfo) (int64, error) {
+	var counter C.cl_ulong
+	status := C.clGetEventProfilingInfo(e.clEvent, C.cl_profiling_info(paramName), C.size_t(unsafe.Sizeof(counter)), unsafe.Pointer(&counter), nil)
+	if status != C.CL_SUCCESS {
+		return 0, toError(status)
+	}
+	return int64(counter), nil
+}
+
+// SetUserEventStatus sets the execution status of a user event object.
+//
+// `status` specifies the new execution status to be set and
+// can be CL_COMPLETE or a negative integer value to indicate
+// an error. A negative integer value causes all enqueued commands
+// that wait on this user event to be terminated. clSetUserEventStatus
+// can only be called once to change the execution status of event.
+//
+// The status returned by clSetUserEventStatus is converted with toError.
+func (e *Event) SetUserEventStatus(status int) error {
+	return toError(C.clSetUserEventStatus(e.clEvent, C.cl_int(status)))
+}
+
+// WaitForEvents waits on the host thread for commands identified by event
+// objects in events to complete. A command is considered complete if its
+// execution status is CL_COMPLETE or a negative value. The events specified
+// in event_list act as synchronization points.
+//
+// If the cl_khr_gl_event extension is enabled, event objects can also be
+// used to reflect the status of an OpenGL sync object. The sync object
+// in turn refers to a fence command executing in an OpenGL command
+// stream. This provides another method of coordinating sharing of buffers
+// and images between OpenGL and OpenCL.
+//
+// The status returned by clWaitForEvents is converted with toError.
+func WaitForEvents(events []*Event) error {
+	return toError(C.clWaitForEvents(C.cl_uint(len(events)), eventListPtr(events)))
+}
+
+// newEvent wraps clEvent in an Event and registers releaseEvent as its
+// finalizer, so the handle is released when the Event is garbage
+// collected if Release was never called.
+func newEvent(clEvent C.cl_event) *Event {
+	ev := &Event{clEvent: clEvent}
+	runtime.SetFinalizer(ev, releaseEvent)
+	return ev
+}
+
+// eventListPtr copies the cl_event handles of el into a freshly allocated
+// array and returns a pointer to its first element, for passing to OpenCL
+// calls that take an event list.
+//
+// It returns nil for a nil or empty slice. The length check (rather than a
+// nil check) matters: a non-nil empty slice would otherwise reach
+// &elist[0] and panic with an index-out-of-range error.
+func eventListPtr(el []*Event) *C.cl_event {
+	if len(el) == 0 {
+		return nil
+	}
+	elist := make([]C.cl_event, len(el))
+	for i, e := range el {
+		elist[i] = e.clEvent
+	}
+	return (*C.cl_event)(&elist[0])
+}
+
+// clBool maps a Go bool onto the OpenCL CL_TRUE / CL_FALSE constants.
+func clBool(b bool) C.cl_bool {
+	if !b {
+		return C.CL_FALSE
+	}
+	return C.CL_TRUE
+}
+
+func sizeT3(i3 [3]int) [3]C.size_t {
+	var val [3]C.size_t
+	val[0] = C.size_t(i3[0])
+	val[1] = C.size_t(i3[1])
+	val[2] = C.size_t(i3[2])
+	return val
+}
+
+// MappedMemObject describes a memory region exposed through a raw pointer:
+// its address, its size in bytes as used by ByteSlice, and the row and
+// slice pitch values reported by the accessors.
+// NOTE(review): presumably produced by the EnqueueMap* calls; the
+// constructor is not in this part of the file.
+type MappedMemObject struct {
+	ptr        unsafe.Pointer
+	size       int
+	rowPitch   int
+	slicePitch int
+}
+
+// ByteSlice returns a []byte of length and capacity mb.size that aliases
+// the memory at mb.ptr. No data is copied: the slice header is filled in
+// by hand, so the result is only usable while that memory stays valid.
+func (mb *MappedMemObject) ByteSlice() []byte {
+	var byteSlice []byte
+	// Overwrite the header of the empty slice in place so that it points
+	// at the mapped region.
+	sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&byteSlice))
+	sliceHeader.Cap = mb.size
+	sliceHeader.Len = mb.size
+	sliceHeader.Data = uintptr(mb.ptr)
+	return byteSlice
+}
+
+// Ptr returns the raw pointer to the mapped region.
+func (mb *MappedMemObject) Ptr() unsafe.Pointer {
+	return mb.ptr
+}
+
+// Size returns the stored size of the mapped region.
+func (mb *MappedMemObject) Size() int {
+	return mb.size
+}
+
+// RowPitch returns the stored row pitch.
+func (mb *MappedMemObject) RowPitch() int {
+	return mb.rowPitch
+}
+
+// SlicePitch returns the stored slice pitch.
+func (mb *MappedMemObject) SlicePitch() int {
+	return mb.slicePitch
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types12.go
@ -0,0 +1,71 @@
+// +build cl12
+
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+// OpenCL 1.2 additions to the channel, memory flag and memory object type
+// enumerations. They are only compiled in with the cl12 build tag.
+const (
+	ChannelDataTypeUNormInt24  ChannelDataType = C.CL_UNORM_INT24
+	ChannelOrderDepth          ChannelOrder    = C.CL_DEPTH
+	ChannelOrderDepthStencil   ChannelOrder    = C.CL_DEPTH_STENCIL
+	MemHostNoAccess            MemFlag         = C.CL_MEM_HOST_NO_ACCESS  // OpenCL 1.2
+	MemHostReadOnly            MemFlag         = C.CL_MEM_HOST_READ_ONLY  // OpenCL 1.2
+	MemHostWriteOnly           MemFlag         = C.CL_MEM_HOST_WRITE_ONLY // OpenCL 1.2
+	MemObjectTypeImage1D       MemObjectType   = C.CL_MEM_OBJECT_IMAGE1D
+	MemObjectTypeImage1DArray  MemObjectType   = C.CL_MEM_OBJECT_IMAGE1D_ARRAY
+	MemObjectTypeImage1DBuffer MemObjectType   = C.CL_MEM_OBJECT_IMAGE1D_BUFFER
+	MemObjectTypeImage2DArray  MemObjectType   = C.CL_MEM_OBJECT_IMAGE2D_ARRAY
+
+	// MapFlagWriteInvalidateRegion is intentionally not declared here: the
+	// package already declares it next to MapFlagRead and MapFlagWrite, and
+	// a second declaration makes builds with the cl12 tag fail with a
+	// redeclaration error.
+	//
+	// For reference, that flag specifies that the region being mapped in
+	// the memory object is being mapped for writing. The contents of the
+	// region being mapped are to be discarded. This is typically the case
+	// when the region being mapped is overwritten by the host. This flag
+	// allows the implementation to no longer guarantee that the pointer
+	// returned by clEnqueueMapBuffer or clEnqueueMapImage contains the
+	// latest bits in the region being mapped which can be a significant
+	// performance enhancement.
+)
+
+// init registers the OpenCL 1.2 error codes in errorMap and the display
+// names of the OpenCL 1.2 channel order and channel data type constants.
+func init() {
+	errorMap[C.CL_COMPILE_PROGRAM_FAILURE] = ErrCompileProgramFailure
+	errorMap[C.CL_DEVICE_PARTITION_FAILED] = ErrDevicePartitionFailed
+	errorMap[C.CL_INVALID_COMPILER_OPTIONS] = ErrInvalidCompilerOptions
+	errorMap[C.CL_INVALID_DEVICE_PARTITION_COUNT] = ErrInvalidDevicePartitionCount
+	errorMap[C.CL_INVALID_IMAGE_DESCRIPTOR] = ErrInvalidImageDescriptor
+	errorMap[C.CL_INVALID_LINKER_OPTIONS] = ErrInvalidLinkerOptions
+	errorMap[C.CL_KERNEL_ARG_INFO_NOT_AVAILABLE] = ErrKernelArgInfoNotAvailable
+	errorMap[C.CL_LINK_PROGRAM_FAILURE] = ErrLinkProgramFailure
+	errorMap[C.CL_LINKER_NOT_AVAILABLE] = ErrLinkerNotAvailable
+	channelOrderNameMap[ChannelOrderDepth] = "Depth"
+	channelOrderNameMap[ChannelOrderDepthStencil] = "DepthStencil"
+	channelDataTypeNameMap[ChannelDataTypeUNormInt24] = "UNormInt24"
+}
+
+// ImageDescription is the Go counterpart of the C cl_image_desc structure;
+// toCl copies each field into the C field of the corresponding name.
+// Buffer is optional and may be nil.
+type ImageDescription struct {
+	Type                            MemObjectType
+	Width, Height, Depth            int
+	ArraySize, RowPitch, SlicePitch int
+	NumMipLevels, NumSamples        int
+	Buffer                          *MemObject
+}
+
+// toCl builds the C cl_image_desc equivalent of d. The buffer field is
+// left at its zero (nil) value unless d.Buffer is set.
+func (d ImageDescription) toCl() C.cl_image_desc {
+	var out C.cl_image_desc
+
+	// Kind and extent of the image.
+	out.image_type = C.cl_mem_object_type(d.Type)
+	out.image_width = C.size_t(d.Width)
+	out.image_height = C.size_t(d.Height)
+	out.image_depth = C.size_t(d.Depth)
+
+	// Array size and pitches.
+	out.image_array_size = C.size_t(d.ArraySize)
+	out.image_row_pitch = C.size_t(d.RowPitch)
+	out.image_slice_pitch = C.size_t(d.SlicePitch)
+
+	out.num_mip_levels = C.cl_uint(d.NumMipLevels)
+	out.num_samples = C.cl_uint(d.NumSamples)
+
+	if backing := d.Buffer; backing != nil {
+		out.buffer = backing.clMem
+	}
+	return out
+}
--- a/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go
+++ b/Godeps/_workspace/src/github.com/Gustav-Simonsson/go-opencl/cl/types_darwin.go
@ -0,0 +1,45 @@
+package cl
+
+// #ifdef __APPLE__
+// #include "OpenCL/opencl.h"
+// #else
+// #include "cl.h"
+// #endif
+import "C"
+
+// Extension: cl_APPLE_fixed_alpha_channel_orders
+//
+// These selectors may be passed to clCreateImage2D() in the cl_image_format.image_channel_order field.
+// They are like CL_BGRA and CL_ARGB except that the alpha channel is ignored. On calls to read_imagef,
+// the alpha will be 0xff (1.0f) if the sample falls in the image and 0 if it does not fall in the image.
+// On calls to write_imagef, the alpha value is ignored and 0xff (1.0f) is written. These formats are
+// currently only available for the CL_UNORM_INT8 cl_channel_type. They are intended to support legacy
+// image formats.
+const (
+	ChannelOrder1RGBApple ChannelOrder = C.CL_1RGB_APPLE // Introduced in MacOS X.7.
+	ChannelOrderBGR1Apple ChannelOrder = C.CL_BGR1_APPLE // Introduced in MacOS X.7.
+)
+
+// Extension: cl_APPLE_biased_fixed_point_image_formats
+//
+// This selector may be passed to clCreateImage2D() in the cl_image_format.image_channel_data_type field.
+// It defines a biased signed 1.14 fixed point storage format, with range [-1, 3). The conversion from
+// float to this fixed point format is defined as follows:
+//
+//      ushort float_to_sfixed14( float x ){
+//          int i = convert_int_sat_rte( x * 0x1.0p14f );         // scale [-1, 3.0) to [-16384, 3*16384), round to nearest integer
+//          i = add_sat( i, 0x4000 );                             // apply bias, to convert to [0, 65535) range
+//          return convert_ushort_sat(i);                         // clamp to destination size
+//      }
+//
+// The inverse conversion is the reverse process. The formats are currently only available on the CPU with
+// the CL_RGBA channel layout.
+const (
+	ChannelDataTypeSFixed14Apple ChannelDataType = C.CL_SFIXED14_APPLE // Introduced in MacOS X.7.
+)
+
+// init registers the display names of the Apple-specific channel order and
+// channel data type constants so that their String methods can resolve
+// them. Each name matches the constant it labels.
+func init() {
+	channelOrderNameMap[ChannelOrder1RGBApple] = "1RGBApple"
+	channelOrderNameMap[ChannelOrderBGR1Apple] = "BGR1Apple"
+	channelDataTypeNameMap[ChannelDataTypeSFixed14Apple] = "SFixed14Apple"
+}
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash.go
@ -30,8 +30,8 @@ import (
 )

 var (
-	minDifficulty = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
-	sharedLight   = new(Light)
+	maxUint256  = new(big.Int).Exp(big.NewInt(2), big.NewInt(256), big.NewInt(0))
+	sharedLight = new(Light)
 )

 const (
@ -140,7 +140,7 @@ func (l *Light) Verify(block pow.Block) bool {
 	// the finalizer before the call completes.
 	_ = cache
 	// The actual check.
-	target := new(big.Int).Div(minDifficulty, difficulty)
+	target := new(big.Int).Div(maxUint256, difficulty)
 	return h256ToHash(ret.result).Big().Cmp(target) <= 0
 }

@ -199,7 +199,7 @@ func (d *dag) generate() {
 		if d.dir == "" {
 			d.dir = DefaultDir
 		}
-		glog.V(logger.Info).Infof("Generating DAG for epoch %d (%x)", d.epoch, seedHash)
+		glog.V(logger.Info).Infof("Generating DAG for epoch %d (size %d) (%x)", d.epoch, dagSize, seedHash)
 		// Generate a temporary cache.
 		// TODO: this could share the cache with Light
 		cache := C.ethash_light_new_internal(cacheSize, (*C.ethash_h256_t)(unsafe.Pointer(&seedHash[0])))
@ -220,14 +220,18 @@ func (d *dag) generate() {
 	})
 }

-func freeDAG(h *dag) {
-	C.ethash_full_delete(h.ptr)
-	h.ptr = nil
+func freeDAG(d *dag) {
+	C.ethash_full_delete(d.ptr)
+	d.ptr = nil
+}
+
+func (d *dag) Ptr() unsafe.Pointer {
+	return unsafe.Pointer(d.ptr.data)
 }

 //export ethashGoCallback
 func ethashGoCallback(percent C.unsigned) C.int {
-	glog.V(logger.Info).Infof("Still generating DAG: %d%%", percent)
+	glog.V(logger.Info).Infof("Generating DAG: %d%%", percent)
 	return 0
 }

@ -273,7 +277,7 @@ func (pow *Full) getDAG(blockNum uint64) (d *dag) {
 	return d
 }

-func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mixDigest []byte) {
+func (pow *Full) Search(block pow.Block, stop <-chan struct{}, index int) (nonce uint64, mixDigest []byte) {
 	dag := pow.getDAG(block.NumberU64())

 	r := rand.New(rand.NewSource(time.Now().UnixNano()))
@ -286,7 +290,7 @@ func (pow *Full) Search(block pow.Block, stop <-chan struct{}) (nonce uint64, mi

 	nonce = uint64(r.Int63())
 	hash := hashToH256(block.HashNoNonce())
-	target := new(big.Int).Div(minDifficulty, diff)
+	target := new(big.Int).Div(maxUint256, diff)
 	for {
 		select {
 		case <-stop:
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl.go
@ -0,0 +1,629 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package ethash
+
+//#cgo LDFLAGS: -w
+//#include <stdint.h>
+//#include <string.h>
+//#include "src/libethash/internal.h"
+import "C"
+
+import (
+	crand "crypto/rand"
+	"encoding/binary"
+	"fmt"
+	"math"
+	"math/big"
+	mrand "math/rand"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+	"unsafe"
+
+	"github.com/Gustav-Simonsson/go-opencl/cl"
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/pow"
+)
+
+/*
+
+  This code has two main entry points:
+
+  1. The InitCL(...) function configures one or more OpenCL devices
+     (for now only GPUs) and loads the Ethash DAG onto device memory
+
+  2. The Search(...) function loads an Ethash nonce into device(s) memory and
+     executes the Ethash OpenCL kernel.
+
+  Throughout the code, we refer to "host memory" and "device memory".
+  For most systems (e.g. regular PC GPU miner) the host memory is RAM and
+  device memory is the GPU global memory (e.g. GDDR5).
+
+  References mentioned in code comments:
+
+  1. https://github.com/ethereum/wiki/wiki/Ethash
+  2. https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner.cpp
+  3. https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/
+  4. http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_OpenCL_Programming_User_Guide.pdf
+
+*/
+
+// OpenCLDevice bundles everything initCLDevice sets up for one GPU: the
+// device handle, its context and command queue, the compiled kernels and
+// the device-side buffers used by Search.
+type OpenCLDevice struct {
+	deviceId int
+	device   *cl.Device
+	openCL11 bool // OpenCL version 1.1 and 1.2 are handled a bit different
+	openCL12 bool
+
+	dagBuf        *cl.MemObject // Ethash full DAG in device mem
+	headerBuf     *cl.MemObject // Hash of block-to-mine in device mem
+	searchBuffers []*cl.MemObject
+
+	searchKernel *cl.Kernel
+	hashKernel   *cl.Kernel
+
+	queue         *cl.CommandQueue
+	ctx           *cl.Context
+	workGroupSize int
+
+	nonceRand *mrand.Rand // seeded by crypto/rand, see comments where it's initialised
+	result    common.Hash
+}
+
+// OpenCLMiner drives Ethash mining on the OpenCL devices selected by
+// deviceIds. mu is taken by Search; hashRate is updated atomically.
+type OpenCLMiner struct {
+	mu sync.Mutex
+
+	ethash *Ethash // Ethash full DAG & cache in host mem
+
+	deviceIds []int
+	devices   []*OpenCLDevice
+
+	dagSize uint64
+
+	hashRate int32 // Go atomics & uint64 have some issues; int32 is supported on all platforms
+}
+
+// pendingSearch records a kernel run that has been enqueued but whose
+// results have not been read yet: the search buffer it writes into and
+// the nonce the run started from.
+type pendingSearch struct {
+	bufIndex   uint32
+	startNonce uint64
+}
+
+// Sizes and tuning parameters shared by initCLDevice and Search.
+const (
+	SIZEOF_UINT32 = 4
+
+	// See [1]
+	ethashMixBytesLen = 128
+	ethashAccesses    = 64
+
+	// See [4]
+	workGroupSize    = 32 // must be multiple of 8
+	maxSearchResults = 63 // substituted for MAX_OUTPUTS in the kernel source
+	searchBufSize    = 2  // number of search result buffers per device
+	globalWorkSize   = 1024 * 256
+)
+
+// NewCL returns a miner for the given device ids. The id slice is copied,
+// so later changes to the caller's slice do not affect the miner. No
+// OpenCL initialisation happens here.
+func NewCL(deviceIds []int) *OpenCLMiner {
+	miner := &OpenCLMiner{
+		ethash:    New(),
+		dagSize:   0, // to see if we need to update DAG.
+		deviceIds: make([]int, len(deviceIds)),
+	}
+	copy(miner.deviceIds, deviceIds)
+	return miner
+}
+
+// PrintDevices writes a report of every OpenCL platform and each of its
+// GPU devices to stdout, followed by a hint on how to benchmark or mine
+// with the devices that were found.
+//
+// "Device id for mining" is the running index of the device across all
+// platforms, in enumeration order. A platform whose device query fails is
+// reported and skipped so that GPUs on the remaining platforms are still
+// listed.
+func PrintDevices() {
+	fmt.Println("=============================================")
+	fmt.Println("============ OpenCL Device Info =============")
+	fmt.Println("=============================================")
+
+	var found []*cl.Device
+
+	platforms, err := cl.GetPlatforms()
+	if err != nil {
+		// Printf, not Println: the message carries a format verb.
+		fmt.Printf("Platform error (check your OpenCL installation): %v\n", err)
+		return
+	}
+
+	for i, p := range platforms {
+		fmt.Println("Platform id             ", i)
+		fmt.Println("Platform Name           ", p.Name())
+		fmt.Println("Platform Vendor         ", p.Vendor())
+		fmt.Println("Platform Version        ", p.Version())
+		fmt.Println("Platform Extensions     ", p.Extensions())
+		fmt.Println("Platform Profile        ", p.Profile())
+		fmt.Println("")
+
+		devices, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+		if err != nil {
+			fmt.Println("Device error (check your GPU drivers) :", err)
+			continue
+		}
+
+		for _, d := range devices {
+			// NOTE(review): this prints the platform index under a
+			// device label; kept as in the original output.
+			fmt.Println("Device OpenCL id        ", i)
+			fmt.Println("Device id for mining    ", len(found))
+			fmt.Println("Device Name             ", d.Name())
+			fmt.Println("Vendor                  ", d.Vendor())
+			fmt.Println("Version                 ", d.Version())
+			fmt.Println("Driver version          ", d.DriverVersion())
+			fmt.Println("Address bits            ", d.AddressBits())
+			fmt.Println("Max clock freq          ", d.MaxClockFrequency())
+			fmt.Println("Global mem size         ", d.GlobalMemSize())
+			fmt.Println("Max constant buffer size", d.MaxConstantBufferSize())
+			fmt.Println("Max mem alloc size      ", d.MaxMemAllocSize())
+			fmt.Println("Max compute units       ", d.MaxComputeUnits())
+			fmt.Println("Max work group size     ", d.MaxWorkGroupSize())
+			fmt.Println("Max work item sizes     ", d.MaxWorkItemSizes())
+			fmt.Println("=============================================")
+
+			found = append(found, d)
+		}
+	}
+	if len(found) == 0 {
+		fmt.Println("Found no GPU(s). Check that your OS can see the GPU(s)")
+		return
+	}
+
+	// Comma-separated list of all mining ids: "0,1,2,...".
+	ids := make([]string, len(found))
+	for i := range found {
+		ids[i] = strconv.Itoa(i)
+	}
+	fmt.Printf("Found %v devices. Benchmark first GPU:       geth gpubench 0\n", len(found))
+	fmt.Printf("Mine using all GPUs:                        geth --minegpu %v\n", strings.Join(ids, ","))
+}
+
+// InitCL prepares the miner c for mining at block blockNum.
+//
+// See [2]. We basically do the same here, but the Go OpenCL bindings
+// are at a slightly higher abstraction level.
+//
+// It enumerates the GPU devices of every OpenCL platform, generates the
+// DAG and cache for blockNum if needed, stores the new Ethash instance and
+// DAG size on c, and then initialises one OpenCLDevice per id in
+// c.deviceIds. Ids index the concatenated device list in platform order.
+//
+// An error is returned when platforms or devices cannot be queried, when
+// an id does not name an available device, when a device fails to
+// initialise, or when no device ends up configured.
+func InitCL(blockNum uint64, c *OpenCLMiner) error {
+	platforms, err := cl.GetPlatforms()
+	if err != nil {
+		return fmt.Errorf("Platform error: %v\nCheck your OpenCL installation and then run geth gpuinfo", err)
+	}
+
+	var devices []*cl.Device
+	for _, p := range platforms {
+		ds, err := cl.GetDevices(p, cl.DeviceTypeGPU)
+		if err != nil {
+			return fmt.Errorf("Devices error: %v\nCheck your GPU drivers and then run geth gpuinfo", err)
+		}
+		devices = append(devices, ds...)
+	}
+
+	eth := New()
+	_ = eth.getDAG(blockNum)     // generates DAG if we don't have it
+	eth.Light.getCache(blockNum) // and cache
+
+	c.ethash = eth
+	c.dagSize = uint64(C.ethash_get_datasize(C.uint64_t(blockNum)))
+
+	// initCLDevice appends to c.devices. Start from an empty list so that a
+	// re-initialisation for a larger DAG replaces the devices set up for
+	// the previous DAG instead of leaving them in front of the new ones,
+	// where Search would keep picking them up by index.
+	c.devices = nil
+	for _, id := range c.deviceIds {
+		if id < 0 || id >= len(devices) {
+			return fmt.Errorf("Device id not found. See available device ids with: geth gpuinfo")
+		}
+		if err := initCLDevice(id, devices[id], c); err != nil {
+			return err
+		}
+	}
+	if len(c.devices) == 0 {
+		return fmt.Errorf("No GPU devices found")
+	}
+	return nil
+}
+
+// initCLDevice sets up a single OpenCL device for mining and appends the
+// resulting OpenCLDevice to c.devices.
+//
+// It creates a context and command queue, compiles the Ethash kernel with
+// the current DAG size substituted in, allocates the DAG, header and
+// search buffers on the device, copies the host DAG into device memory and
+// seeds the per-device nonce generator. c.dagSize and c.ethash must
+// already describe the DAG to upload.
+//
+// Every failure is returned as an error that includes the underlying
+// OpenCL error.
+func initCLDevice(deviceId int, device *cl.Device, c *OpenCLMiner) error {
+	devMaxAlloc := uint64(device.MaxMemAllocSize())
+	devGlobalMem := uint64(device.GlobalMemSize())
+
+	// TODO: more fine grained version logic
+	if device.Version() == "OpenCL 1.0" {
+		fmt.Println("Device OpenCL version not supported: ", device.Version())
+		return fmt.Errorf("opencl version not supported")
+	}
+
+	var cl11, cl12 bool
+	if device.Version() == "OpenCL 1.1" {
+		cl11 = true
+	}
+	if device.Version() == "OpenCL 1.2" {
+		cl12 = true
+	}
+
+	// log warnings but carry on; some device drivers report inaccurate values
+	if c.dagSize > devGlobalMem {
+		fmt.Printf("WARNING: device memory may be insufficient: %v. DAG size: %v.\n", devGlobalMem, c.dagSize)
+	}
+
+	if c.dagSize > devMaxAlloc {
+		fmt.Printf("WARNING: DAG size (%v) larger than device max memory allocation size (%v).\n", c.dagSize, devMaxAlloc)
+		fmt.Printf("You probably have to export GPU_MAX_ALLOC_PERCENT=95\n")
+	}
+
+	fmt.Printf("Initialising device %v: %v\n", deviceId, device.Name())
+
+	context, err := cl.CreateContext([]*cl.Device{device})
+	if err != nil {
+		return fmt.Errorf("failed creating context: %v", err)
+	}
+
+	// TODO: test running with CL_QUEUE_PROFILING_ENABLE for profiling?
+	queue, err := context.CreateCommandQueue(device, 0)
+	if err != nil {
+		return fmt.Errorf("command queue err: %v", err)
+	}
+
+	// See [4] section 3.2 and [3] "clBuildProgram".
+	// The OpenCL kernel code is compiled at run-time.
+	kvs := make(map[string]string, 4)
+	kvs["GROUP_SIZE"] = strconv.FormatUint(workGroupSize, 10)
+	kvs["DAG_SIZE"] = strconv.FormatUint(c.dagSize/ethashMixBytesLen, 10)
+	kvs["ACCESSES"] = strconv.FormatUint(ethashAccesses, 10)
+	kvs["MAX_OUTPUTS"] = strconv.FormatUint(maxSearchResults, 10)
+	kernelCode := replaceWords(kernel, kvs)
+
+	program, err := context.CreateProgramWithSource([]string{kernelCode})
+	if err != nil {
+		return fmt.Errorf("program err: %v", err)
+	}
+
+	/* if using AMD OpenCL impl, you can set this to debug on x86 CPU device.
+	   see AMD OpenCL programming guide section 4.2
+
+	   export in shell before running:
+	   export AMD_OCL_BUILD_OPTIONS_APPEND="-g -O0"
+	   export CPU_MAX_COMPUTE_UNITS=1
+
+	buildOpts := "-g -cl-opt-disable"
+
+	*/
+	buildOpts := ""
+	err = program.BuildProgram([]*cl.Device{device}, buildOpts)
+	if err != nil {
+		return fmt.Errorf("program build err: %v", err)
+	}
+
+	// Check each kernel separately; a single shared check would let the
+	// second call overwrite the error of the first.
+	searchKernel, err := program.CreateKernel("ethash_search")
+	if err != nil {
+		return fmt.Errorf("kernel err: %v", err)
+	}
+	hashKernel, err := program.CreateKernel("ethash_hash")
+	if err != nil {
+		return fmt.Errorf("kernel err: %v", err)
+	}
+
+	// TODO: when this DAG size appears, patch the Go bindings
+	// (context.go) to work with uint64 as size_t
+	if c.dagSize > math.MaxInt32 {
+		fmt.Println("DAG too large for allocation.")
+		return fmt.Errorf("DAG too large for alloc")
+	}
+
+	// TODO: patch up Go bindings to work with size_t, will overflow if > maxint32
+	// TODO: the DAG size is expected to pass that limit around 2017-06-09
+	dagBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, int(c.dagSize))
+	if err != nil {
+		return fmt.Errorf("allocating dag buf failed: %v", err)
+	}
+
+	// write DAG to device mem
+	dagPtr := unsafe.Pointer(c.ethash.Full.current.ptr.data)
+	_, err = queue.EnqueueWriteBuffer(dagBuf, true, 0, int(c.dagSize), dagPtr, nil)
+	if err != nil {
+		return fmt.Errorf("writing to dag buf failed: %v", err)
+	}
+
+	// Each search buffer holds a result count followed by up to
+	// maxSearchResults uint32 nonce offsets.
+	searchBuffers := make([]*cl.MemObject, searchBufSize)
+	for i := 0; i < searchBufSize; i++ {
+		searchBuff, err := context.CreateEmptyBuffer(cl.MemWriteOnly, (1+maxSearchResults)*SIZEOF_UINT32)
+		if err != nil {
+			return fmt.Errorf("search buffer err: %v", err)
+		}
+		searchBuffers[i] = searchBuff
+	}
+
+	headerBuf, err := context.CreateEmptyBuffer(cl.MemReadOnly, 32)
+	if err != nil {
+		return fmt.Errorf("header buffer err: %v", err)
+	}
+
+	// Unique, random nonces are crucial for mining efficiency.
+	// While we do not need cryptographically secure PRNG for nonces,
+	// we want to have uniform distribution and minimal repetition of nonces.
+	// We could guarantee strict uniqueness of nonces by generating unique ranges,
+	// but a int64 seed from crypto/rand should be good enough.
+	// we then use math/rand for speed and to avoid draining OS entropy pool
+	seed, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64))
+	if err != nil {
+		return err
+	}
+	nonceRand := mrand.New(mrand.NewSource(seed.Int64()))
+
+	deviceStruct := &OpenCLDevice{
+		deviceId: deviceId,
+		device:   device,
+		openCL11: cl11,
+		openCL12: cl12,
+
+		dagBuf:        dagBuf,
+		headerBuf:     headerBuf,
+		searchBuffers: searchBuffers,
+
+		searchKernel: searchKernel,
+		hashKernel:   hashKernel,
+
+		queue: queue,
+		ctx:   context,
+
+		workGroupSize: workGroupSize,
+
+		nonceRand: nonceRand,
+	}
+	c.devices = append(c.devices, deviceStruct)
+
+	return nil
+}
+
+// Search mines block on the device c.devices[index] until a valid nonce is
+// found or stop becomes readable.
+//
+// On success it returns the nonce and the 32-byte mix digest. In every
+// other case (stop received, OpenCL error, no device at index) it returns
+// 0 and []byte{0}; OpenCL errors are printed to stdout.
+//
+// If the DAG needed for block is larger than the one currently loaded,
+// InitCL is run first. While searching, the estimated hash rate of this
+// call is added to c.hashRate; the contribution is withdrawn again when
+// the call returns, whatever the reason.
+//
+// NOTE(review): c.mu is held for the whole call, as before, so calls for
+// different device indexes on the same miner do not overlap.
+func (c *OpenCLMiner) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
+	c.mu.Lock()
+	// Deferred right away so that every return path, including a failed
+	// InitCL below, releases the lock.
+	defer c.mu.Unlock()
+
+	newDagSize := uint64(C.ethash_get_datasize(C.uint64_t(block.NumberU64())))
+	if newDagSize > c.dagSize {
+		// TODO: clean up buffers from previous DAG?
+		err := InitCL(block.NumberU64(), c)
+		if err != nil {
+			fmt.Println("OpenCL init error: ", err)
+			return 0, []byte{0}
+		}
+	}
+
+	// Avoid unneeded OpenCL initialisation if we received stop while running InitCL
+	select {
+	case <-stop:
+		return 0, []byte{0}
+	default:
+	}
+
+	if index < 0 || index >= len(c.devices) {
+		fmt.Println("Error in Search: no OpenCL device at index", index)
+		return 0, []byte{0}
+	}
+	d := c.devices[index]
+
+	headerHash := block.HashNoNonce()
+	diff := block.Difficulty()
+	target256 := new(big.Int).Div(maxUint256, diff)
+	// The kernel compares against the most significant 64 bits of the target.
+	target64 := new(big.Int).Rsh(target256, 192).Uint64()
+	var zero uint32
+
+	_, err := d.queue.EnqueueWriteBuffer(d.headerBuf, false, 0, 32, unsafe.Pointer(&headerHash[0]), nil)
+	if err != nil {
+		fmt.Println("Error in Search clEnqueueWriterBuffer : ", err)
+		return 0, []byte{0}
+	}
+
+	// Reset the result counter at the start of every search buffer.
+	for i := 0; i < searchBufSize; i++ {
+		_, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[i], false, 0, 4, unsafe.Pointer(&zero), nil)
+		if err != nil {
+			fmt.Println("Error in Search clEnqueueWriterBuffer : ", err)
+			return 0, []byte{0}
+		}
+	}
+
+	// wait for all search buffers to complete
+	err = d.queue.Finish()
+	if err != nil {
+		fmt.Println("Error in Search clFinish : ", err)
+		return 0, []byte{0}
+	}
+
+	// Kernel arguments that stay fixed for the whole search. Arguments 0
+	// (result buffer) and 3 (start nonce) are set per batch in the loop.
+	err = d.searchKernel.SetArg(1, d.headerBuf)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	err = d.searchKernel.SetArg(2, d.dagBuf)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	err = d.searchKernel.SetArg(4, target64)
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+	err = d.searchKernel.SetArg(5, uint32(math.MaxUint32))
+	if err != nil {
+		fmt.Println("Error in Search clSetKernelArg : ", err)
+		return 0, []byte{0}
+	}
+
+	// wait on this before returning
+	// NOTE(review): nothing in this function ever sets the status of this
+	// user event, so the wait on the success path below looks like it can
+	// block indefinitely on devices flagged openCL12 - confirm against [2].
+	var preReturnEvent *cl.Event
+	if d.openCL12 {
+		preReturnEvent, err = d.ctx.CreateUserEvent()
+		if err != nil {
+			fmt.Println("Error in Search create CL user event : ", err)
+			return 0, []byte{0}
+		}
+	}
+
+	pending := make([]pendingSearch, 0, searchBufSize)
+	searchBufIndex := uint32(0)
+	loops := int64(0)
+	prevHashRate := int32(0)
+	// Whatever this call has added to the shared hash rate is removed again
+	// on every exit path, so the total does not grow with each solution
+	// found or error hit.
+	defer func() {
+		atomic.AddInt32(&c.hashRate, -prevHashRate)
+	}()
+	start := time.Now().UnixNano()
+	// we grab a single random nonce and sets this as argument to the kernel search function
+	// the device will then add each local threads gid to the nonce, creating a unique nonce
+	// for each device computing unit executing in parallel
+	initNonce := uint64(d.nonceRand.Int63())
+	for nonce := initNonce; ; nonce += uint64(globalWorkSize) {
+		select {
+		case <-stop:
+			return 0, []byte{0}
+		default:
+		}
+
+		if (loops % (1 << 7)) == 0 {
+			// Skip the update when no time has elapsed, to avoid dividing by zero.
+			if elapsed := time.Now().UnixNano() - start; elapsed > 0 {
+				// TODO: verify if this is correct hash rate calculation
+				hashes := (float64(1e9) / float64(elapsed)) * float64(loops*globalWorkSize)
+				hashrateDiff := int32(hashes) - prevHashRate
+				prevHashRate = int32(hashes)
+				atomic.AddInt32(&c.hashRate, hashrateDiff)
+			}
+		}
+		loops++
+
+		err = d.searchKernel.SetArg(0, d.searchBuffers[searchBufIndex])
+		if err != nil {
+			fmt.Println("Error in Search clSetKernelArg : ", err)
+			return 0, []byte{0}
+		}
+		err = d.searchKernel.SetArg(3, nonce)
+		if err != nil {
+			fmt.Println("Error in Search clSetKernelArg : ", err)
+			return 0, []byte{0}
+		}
+
+		// execute kernel
+		_, err := d.queue.EnqueueNDRangeKernel(
+			d.searchKernel,
+			[]int{0},
+			[]int{globalWorkSize},
+			[]int{d.workGroupSize},
+			nil)
+		if err != nil {
+			fmt.Println("Error in Search clEnqueueNDRangeKernel : ", err)
+			return 0, []byte{0}
+		}
+
+		pending = append(pending, pendingSearch{bufIndex: searchBufIndex, startNonce: nonce})
+		searchBufIndex = (searchBufIndex + 1) % searchBufSize
+
+		if len(pending) == searchBufSize {
+			// Always collect the oldest batch first (FIFO, as in [2]). This
+			// keeps the start nonce paired with the buffer that batch wrote
+			// to; picking by ring index pairs results with the wrong nonce
+			// after the first round.
+			p := pending[0]
+			cres, _, err := d.queue.EnqueueMapBuffer(d.searchBuffers[p.bufIndex], true,
+				cl.MapFlagRead, 0, (1+maxSearchResults)*SIZEOF_UINT32,
+				nil)
+			if err != nil {
+				fmt.Println("Error in Search clEnqueueMapBuffer: ", err)
+				return 0, []byte{0}
+			}
+
+			results := cres.ByteSlice()
+			// First word is the number of results, capped at the buffer capacity.
+			nfound := binary.LittleEndian.Uint32(results)
+			if nfound > maxSearchResults {
+				nfound = maxSearchResults
+			}
+			// OpenCL returns the offsets from the start nonce
+			for i := uint32(0); i < nfound; i++ {
+				lo := (i + 1) * SIZEOF_UINT32
+				hi := (i + 2) * SIZEOF_UINT32
+				upperNonce := uint64(binary.LittleEndian.Uint32(results[lo:hi]))
+				checkNonce := p.startNonce + upperNonce
+				if checkNonce != 0 {
+					cn := C.uint64_t(checkNonce)
+					ds := C.uint64_t(c.dagSize)
+					// We verify that the nonce is indeed a solution by
+					// executing the Ethash verification function (on the CPU).
+					ret := C.ethash_light_compute_internal(c.ethash.Light.current.ptr, ds, hashToH256(headerHash), cn)
+					// TODO: return result first
+					if ret.success && h256ToHash(ret.result).Big().Cmp(target256) <= 0 {
+						_, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+						if err != nil {
+							fmt.Println("Error in Search clEnqueueUnmapMemObject: ", err)
+						}
+						if d.openCL12 {
+							err = cl.WaitForEvents([]*cl.Event{preReturnEvent})
+							if err != nil {
+								fmt.Println("Error in Search WaitForEvents: ", err)
+							}
+						}
+						return checkNonce, C.GoBytes(unsafe.Pointer(&ret.mix_hash), C.int(32))
+					}
+
+					// Not a real solution: clear the result counter so the
+					// buffer can be reused by a later batch.
+					_, err := d.queue.EnqueueWriteBuffer(d.searchBuffers[p.bufIndex], false, 0, 4, unsafe.Pointer(&zero), nil)
+					if err != nil {
+						fmt.Println("Error in Search cl: EnqueueWriteBuffer", err)
+						return 0, []byte{0}
+					}
+				}
+			}
+			_, err = d.queue.EnqueueUnmapMemObject(d.searchBuffers[p.bufIndex], cres, nil)
+			if err != nil {
+				fmt.Println("Error in Search clEnqueueUnMapMemObject: ", err)
+				return 0, []byte{0}
+			}
+			// Drop the batch just handled, reusing the backing array.
+			pending = append(pending[:0], pending[1:]...)
+		}
+	}
+}
+
+// Verify checks the proof of work of block by delegating to the light
+// Ethash verifier held by the miner.
+func (c *OpenCLMiner) Verify(block pow.Block) bool {
+	return c.ethash.Light.Verify(block)
+}
+
+// GetHashrate returns the current value of the hash rate counter that
+// Search maintains, read atomically.
+func (c *OpenCLMiner) GetHashrate() int64 {
+	return int64(atomic.LoadInt32(&c.hashRate))
+}
+
+// Turbo does nothing; the argument is ignored.
+func (c *OpenCLMiner) Turbo(on bool) {
+	// This is GPU mining. Always be turbo.
+}
+
+// replaceWords returns text with every occurrence of each key of kvs
+// replaced by the corresponding value. The keys are applied one after
+// another, in map iteration order.
+func replaceWords(text string, kvs map[string]string) string {
+	out := text
+	for placeholder := range kvs {
+		out = strings.Replace(out, placeholder, kvs[placeholder], -1)
+	}
+	return out
+}
+
+// logErr prints err to stdout with an OpenCL prefix; a nil error prints
+// nothing.
+func logErr(err error) {
+	if err == nil {
+		return
+	}
+	fmt.Println("Error in OpenCL call:", err)
+}
+
+// argErr wraps err in a new error prefixed with "arg err: ". The result is
+// never nil, even when err is nil.
+func argErr(err error) error {
+	return fmt.Errorf("arg err: %v", err)
+}
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_opencl_kernel_go_str.go
@ -0,0 +1,600 @@
+package ethash
+
+/*  DO NOT EDIT!!!
+
+    This code is version controlled at
+    https://github.com/ethereum/cpp-ethereum/blob/develop/libethash-cl/ethash_cl_miner_kernel.cl
+
+    If needed change it there first, then copy over here.
+*/
+
+const kernel = `
+// author Tim Hughes <tim@twistedfury.com>
+// Tested on Radeon HD 7850
+// Hashrate: 15940347 hashes/s
+// Bandwidth: 124533 MB/s
+// search kernel should fit in <= 84 VGPRS (3 wavefronts)
+
+#define THREADS_PER_HASH (128 / 16)
+#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
+
+#define FNV_PRIME	0x01000193
+
+__constant uint2 const Keccak_f1600_RC[24] = {
+	(uint2)(0x00000001, 0x00000000),
+	(uint2)(0x00008082, 0x00000000),
+	(uint2)(0x0000808a, 0x80000000),
+	(uint2)(0x80008000, 0x80000000),
+	(uint2)(0x0000808b, 0x00000000),
+	(uint2)(0x80000001, 0x00000000),
+	(uint2)(0x80008081, 0x80000000),
+	(uint2)(0x00008009, 0x80000000),
+	(uint2)(0x0000008a, 0x00000000),
+	(uint2)(0x00000088, 0x00000000),
+	(uint2)(0x80008009, 0x00000000),
+	(uint2)(0x8000000a, 0x00000000),
+	(uint2)(0x8000808b, 0x00000000),
+	(uint2)(0x0000008b, 0x80000000),
+	(uint2)(0x00008089, 0x80000000),
+	(uint2)(0x00008003, 0x80000000),
+	(uint2)(0x00008002, 0x80000000),
+	(uint2)(0x00000080, 0x80000000),
+	(uint2)(0x0000800a, 0x00000000),
+	(uint2)(0x8000000a, 0x80000000),
+	(uint2)(0x80008081, 0x80000000),
+	(uint2)(0x00008080, 0x80000000),
+	(uint2)(0x80000001, 0x00000000),
+	(uint2)(0x80008008, 0x80000000),
+};
+
+void keccak_f1600_round(uint2* a, uint r, uint out_size)
+{
+   #if !__ENDIAN_LITTLE__
+	for (uint i = 0; i != 25; ++i)
+		a[i] = a[i].yx;
+   #endif
+
+	uint2 b[25];
+	uint2 t;
+
+	// Theta
+	b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
+	b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
+	b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
+	b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
+	b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
+	t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31);
+	a[0] ^= t;
+	a[5] ^= t;
+	a[10] ^= t;
+	a[15] ^= t;
+	a[20] ^= t;
+	t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31);
+	a[1] ^= t;
+	a[6] ^= t;
+	a[11] ^= t;
+	a[16] ^= t;
+	a[21] ^= t;
+	t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31);
+	a[2] ^= t;
+	a[7] ^= t;
+	a[12] ^= t;
+	a[17] ^= t;
+	a[22] ^= t;
+	t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31);
+	a[3] ^= t;
+	a[8] ^= t;
+	a[13] ^= t;
+	a[18] ^= t;
+	a[23] ^= t;
+	t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31);
+	a[4] ^= t;
+	a[9] ^= t;
+	a[14] ^= t;
+	a[19] ^= t;
+	a[24] ^= t;
+
+	// Rho Pi
+	b[0] = a[0];
+	b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31);
+	b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29);
+	b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26);
+	b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22);
+	b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17);
+	b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11);
+	b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4);
+	b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28);
+	b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19);
+	b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9);
+	b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30);
+	b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18);
+	b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5);
+	b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23);
+	b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8);
+	b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24);
+	b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7);
+	b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21);
+	b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2);
+	b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14);
+	b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25);
+	b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3);
+	b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12);
+	b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20);
+
+	// Chi
+	a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
+	a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
+	a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
+	a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
+	if (out_size >= 4)
+	{
+		a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
+		a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
+		a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
+		a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
+		a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
+		if (out_size >= 8)
+		{
+			a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
+			a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
+			a[11] = bitselect(b[11] ^ b[13], b[11], b[12]);
+			a[12] = bitselect(b[12] ^ b[14], b[12], b[13]);
+			a[13] = bitselect(b[13] ^ b[10], b[13], b[14]);
+			a[14] = bitselect(b[14] ^ b[11], b[14], b[10]);
+			a[15] = bitselect(b[15] ^ b[17], b[15], b[16]);
+			a[16] = bitselect(b[16] ^ b[18], b[16], b[17]);
+			a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
+			a[18] = bitselect(b[18] ^ b[15], b[18], b[19]);
+			a[19] = bitselect(b[19] ^ b[16], b[19], b[15]);
+			a[20] = bitselect(b[20] ^ b[22], b[20], b[21]);
+			a[21] = bitselect(b[21] ^ b[23], b[21], b[22]);
+			a[22] = bitselect(b[22] ^ b[24], b[22], b[23]);
+			a[23] = bitselect(b[23] ^ b[20], b[23], b[24]);
+			a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
+		}
+	}
+
+	// Iota
+	a[0] ^= Keccak_f1600_RC[r];
+
+   #if !__ENDIAN_LITTLE__
+	for (uint i = 0; i != 25; ++i)
+		a[i] = a[i].yx;
+   #endif
+}
+
+void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
+{
+	for (uint i = in_size; i != 25; ++i)
+	{
+		a[i] = 0;
+	}
+#if __ENDIAN_LITTLE__
+	a[in_size] ^= 0x0000000000000001;
+	a[24-out_size*2] ^= 0x8000000000000000;
+#else
+	a[in_size] ^= 0x0100000000000000;
+	a[24-out_size*2] ^= 0x0000000000000080;
+#endif
+
+	// Originally I unrolled the first and last rounds to interface
+	// better with surrounding code, however I haven't done this
+	// without causing the AMD compiler to blow up the VGPR usage.
+	uint r = 0;
+	do
+	{
+		// This dynamic branch stops the AMD compiler unrolling the loop
+		// and additionally saves about 33% of the VGPRs, enough to gain another
+		// wavefront. Ideally we'd get 4 in flight, but 3 is the best I can
+		// massage out of the compiler. It doesn't really seem to matter how
+		// much we try and help the compiler save VGPRs because it seems to throw
+		// that information away, hence the implementation of keccak here
+		// doesn't bother.
+		if (isolate)
+		{
+			keccak_f1600_round((uint2*)a, r++, 25);
+		}
+	}
+	while (r < 23);
+
+	// final round optimised for digest size
+	keccak_f1600_round((uint2*)a, r++, out_size);
+}
+
+#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
+
+#define countof(x) (sizeof(x) / sizeof(x[0]))
+
+uint fnv(uint x, uint y)
+{
+	return x * FNV_PRIME ^ y;
+}
+
+uint4 fnv4(uint4 x, uint4 y)
+{
+	return x * FNV_PRIME ^ y;
+}
+
+uint fnv_reduce(uint4 v)
+{
+	return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
+
+typedef union
+{
+	ulong ulongs[32 / sizeof(ulong)];
+	uint uints[32 / sizeof(uint)];
+} hash32_t;
+
+typedef union
+{
+	ulong ulongs[64 / sizeof(ulong)];
+	uint4 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union
+{
+	uint uints[128 / sizeof(uint)];
+	uint4 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
+{
+	hash64_t init;
+	uint const init_size = countof(init.ulongs);
+	uint const hash_size = countof(header->ulongs);
+
+	// sha3_512(header .. nonce)
+	ulong state[25];
+	copy(state, header->ulongs, hash_size);
+	state[hash_size] = nonce;
+	keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);
+
+	copy(init.ulongs, state, init_size);
+	return init;
+}
+
+uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
+{
+	uint4 mix = init;
+
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uint init0 = *share;
+
+	uint a = 0;
+	do
+	{
+		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+		#pragma unroll
+		for (uint i = 0; i != 4; ++i)
+		{
+			if (update_share)
+			{
+				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			mix = fnv4(mix, *share>=3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] : *share>=DAG_SIZE / 2 ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] : *share>=DAG_SIZE / 4 ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id]:g_dag[*share].uint4s[thread_id]);
+		}
+	} while ((a += 4) != (ACCESSES & isolate));
+
+	return fnv_reduce(mix);
+}
+
+
+
+uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
+{
+	uint4 mix = init;
+
+	// share init0
+	if (thread_id == 0)
+		*share = mix.x;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	uint init0 = *share;
+
+	uint a = 0;
+	do
+	{
+		bool update_share = thread_id == (a/4) % THREADS_PER_HASH;
+
+		#pragma unroll
+		for (uint i = 0; i != 4; ++i)
+		{
+			if (update_share)
+			{
+				uint m[4] = { mix.x, mix.y, mix.z, mix.w };
+				*share = fnv(init0 ^ (a+i), m[i]) % DAG_SIZE;
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
+		}
+	}
+	while ((a += 4) != (ACCESSES & isolate));
+
+	return fnv_reduce(mix);
+}
+
+
+hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
+{
+	ulong state[25];
+
+	hash32_t hash;
+	uint const hash_size = countof(hash.ulongs);
+	uint const init_size = countof(init->ulongs);
+	uint const mix_size = countof(mix->ulongs);
+
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	copy(state, init->ulongs, init_size);
+	copy(state + init_size, mix->ulongs, mix_size);
+	keccak_f1600_no_absorb(state, init_size+mix_size, hash_size, isolate);
+
+	// copy out
+	copy(hash.ulongs, state, hash_size);
+	return hash;
+}
+
+hash32_t compute_hash_simple(
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong nonce,
+	uint isolate
+	)
+{
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	hash128_t mix;
+	for (uint i = 0; i != countof(mix.uint4s); ++i)
+	{
+		mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
+	}
+
+	uint mix_val = mix.uints[0];
+	uint init0 = mix.uints[0];
+	uint a = 0;
+	do
+	{
+		uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
+		uint n = (a+1) % countof(mix.uints);
+
+		#pragma unroll
+		for (uint i = 0; i != countof(mix.uints); ++i)
+		{
+			mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
+			mix_val = i == n ? mix.uints[i] : mix_val;
+		}
+	}
+	while (++a != (ACCESSES & isolate));
+
+	// reduce to output
+	hash32_t fnv_mix;
+	for (uint i = 0; i != countof(fnv_mix.uints); ++i)
+	{
+		fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
+	}
+
+	return final_hash(&init, &fnv_mix, isolate);
+}
+
+typedef union
+{
+	struct
+	{
+		hash64_t init;
+		uint pad; // avoid lds bank conflicts
+	};
+	hash32_t mix;
+} compute_hash_share;
+
+
+hash32_t compute_hash(
+	__local compute_hash_share* share,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// Threads work together in this phase in groups of 8.
+	uint const thread_id = gid % THREADS_PER_HASH;
+	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+	hash32_t mix;
+	uint i = 0;
+	do
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);
+
+		share[hash_id].mix.uints[thread_id] = thread_mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if (i == thread_id)
+			mix = share[hash_id].mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	while (++i != (THREADS_PER_HASH & isolate));
+
+	return final_hash(&init, &mix, isolate);
+}
+
+
+hash32_t compute_hash_chunks(
+	__local compute_hash_share* share,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+
+	// Compute one init hash per work item.
+	hash64_t init = init_hash(g_header, nonce, isolate);
+
+	// Threads work together in this phase in groups of 8.
+	uint const thread_id = gid % THREADS_PER_HASH;
+	uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;
+
+	hash32_t mix;
+	uint i = 0;
+	do
+	{
+		// share init with other threads
+		if (i == thread_id)
+			share[hash_id].init = init;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);
+
+		share[hash_id].mix.uints[thread_id] = thread_mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if (i == thread_id)
+			mix = share[hash_id].mix;
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	while (++i != (THREADS_PER_HASH & isolate));
+
+	return final_hash(&init, &mix, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_simple(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_simple(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
+
+	if (hash.ulongs[countof(hash.ulongs)-1] < target)
+	{
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+		g_output[slot] = gid;
+	}
+}
+
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
+
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	{
+		uint slot = min((uint)MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
+		g_output[slot] = gid;
+	}
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_hash_chunks(
+	__global hash32_t* g_hashes,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3,start_nonce + gid, isolate);
+}
+
+__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+__kernel void ethash_search_chunks(
+	__global volatile uint* restrict g_output,
+	__constant hash32_t const* g_header,
+	__global hash128_t const* g_dag,
+	__global hash128_t const* g_dag1,
+	__global hash128_t const* g_dag2,
+	__global hash128_t const* g_dag3,
+	ulong start_nonce,
+	ulong target,
+	uint isolate
+	)
+{
+	__local compute_hash_share share[HASHES_PER_LOOP];
+
+	uint const gid = get_global_id(0);
+	hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
+
+	if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
+	{
+		uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
+		g_output[slot] = gid;
+	}
+}
+`
--- a/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
+++ b/Godeps/_workspace/src/github.com/ethereum/ethash/ethash_test.go
@ -92,7 +92,7 @@ func TestEthashConcurrentVerify(t *testing.T) {
 	defer os.RemoveAll(eth.Full.Dir)

 	block := &testBlock{difficulty: big.NewInt(10)}
-	nonce, md := eth.Search(block, nil)
+	nonce, md := eth.Search(block, nil, 0)
 	block.nonce = nonce
 	block.mixDigest = common.BytesToHash(md)

@ -135,7 +135,7 @@ func TestEthashConcurrentSearch(t *testing.T) {
 	// launch n searches concurrently.
 	for i := 0; i < nsearch; i++ {
 		go func() {
-			nonce, md := eth.Search(block, stop)
+			nonce, md := eth.Search(block, stop, 0)
 			select {
 			case found <- searchRes{n: nonce, md: md}:
 			case <-stop:
@ -167,7 +167,7 @@ func TestEthashSearchAcrossEpoch(t *testing.T) {
 	for i := epochLength - 40; i < epochLength+40; i++ {
 		block := &testBlock{number: i, difficulty: big.NewInt(90)}
 		rand.Read(block.hashNoNonce[:])
-		nonce, md := eth.Search(block, nil)
+		nonce, md := eth.Search(block, nil, 0)
 		block.nonce = nonce
 		block.mixDigest = common.BytesToHash(md)
 		if !eth.Verify(block) {
--- a/6
+++ b/6
@ -6,7 +6,7 @@
 GOBIN = build/bin

 geth:
-	build/env.sh go install -v $(shell build/ldflags.sh) ./cmd/geth
+	build/env.sh go install -v $(shell build/flags.sh) ./cmd/geth
 	@echo "Done building."
 	@echo "Run \"$(GOBIN)/geth\" to launch geth."

@ -39,12 +39,12 @@ evm:
 	@echo "Done building."
 	@echo "Run \"$(GOBIN)/evm to start the evm."
 mist:
-	build/env.sh go install -v $(shell build/ldflags.sh) ./cmd/mist
+	build/env.sh go install -v $(shell build/flags.sh) ./cmd/mist
 	@echo "Done building."
 	@echo "Run \"$(GOBIN)/mist --asset_path=cmd/mist/assets\" to launch mist."

 all:
-	build/env.sh go install -v $(shell build/ldflags.sh) ./...
+	build/env.sh go install -v $(shell build/flags.sh) ./...

 test: all
 	build/env.sh go test ./...
--- a/build/ldflags.sh
+++ b/build/ldflags.sh
@ -16,3 +16,7 @@ sep=$(go version | awk '{ if ($3 >= "go1.5" || index($3, "devel")) print "="; el
 if [ -f ".git/HEAD" ]; then
    echo "-ldflags '-X main.gitCommit$sep$(git rev-parse HEAD)'"
 fi
+
+if [ ! -z "$GO_OPENCL" ]; then
+   echo "-tags opencl"
+fi
--- a/cmd/geth/js_test.go
+++ b/cmd/geth/js_test.go
@ -468,8 +468,7 @@ func processTxs(repl *testjethre, t *testing.T, expTxc int) bool {
 		t.Errorf("incorrect number of pending transactions, expected %v, got %v", expTxc, txc)
 		return false
 	}
-
-	err = repl.ethereum.StartMining(runtime.NumCPU())
+	err = repl.ethereum.StartMining(runtime.NumCPU(), "")
 	if err != nil {
 		t.Errorf("unexpected error mining: %v", err)
 		return false
--- a/cmd/geth/main.go
+++ b/cmd/geth/main.go
@ -104,6 +104,22 @@ The makedag command generates an ethash DAG in /tmp/dag.

 This command exists to support the system testing project.
 Regular users do not need to execute it.
+`,
+		},
+		{
+			Action: gpuinfo,
+			Name:   "gpuinfo",
+			Usage:  "gpuinfo",
+			Description: `
+Prints OpenCL device info for all found GPUs.
+`,
+		},
+		{
+			Action: gpubench,
+			Name:   "gpubench",
+			Usage:  "benchmark GPU",
+			Description: `
+Runs quick benchmark on first GPU found.
 `,
 		},
 		{
@ -298,6 +314,7 @@ JavaScript API. See https://github.com/ethereum/go-ethereum/wiki/Javascipt-Conso
 		utils.GasPriceFlag,
 		utils.MinerThreadsFlag,
 		utils.MiningEnabledFlag,
+		utils.MiningGPUFlag,
 		utils.AutoDAGFlag,
 		utils.NATFlag,
 		utils.NatspecEnabledFlag,
@ -586,7 +603,10 @@ func startEth(ctx *cli.Context, eth *eth.Ethereum) {
 		}
 	}
 	if ctx.GlobalBool(utils.MiningEnabledFlag.Name) {
-		if err := eth.StartMining(ctx.GlobalInt(utils.MinerThreadsFlag.Name)); err != nil {
+		err := eth.StartMining(
+			ctx.GlobalInt(utils.MinerThreadsFlag.Name),
+			ctx.GlobalString(utils.MiningGPUFlag.Name))
+		if err != nil {
 			utils.Fatalf("%v", err)
 		}
 	}
@ -740,6 +760,29 @@ func makedag(ctx *cli.Context) {
 	}
 }

+// gpuinfo is the action of the "gpuinfo" command. It ignores ctx and calls
+// eth.PrintOpenCLDevices.
+func gpuinfo(ctx *cli.Context) {
+	eth.PrintOpenCLDevices()
+}
+
+// gpubench is the action of the "gpubench" command. With no argument it
+// benchmarks GPU 0; with one argument it parses that argument as an unsigned
+// integer (base inferred from its prefix) and benchmarks that GPU. Any other
+// argument count, or an unparsable number, reports the usage string through
+// utils.Fatalf.
+func gpubench(ctx *cli.Context) {
+	usage := func() {
+		utils.Fatalf(`Usage: geth gpubench <gpu number>`)
+	}
+
+	argv := ctx.Args()
+	switch len(argv) {
+	case 0:
+		eth.GPUBench(0)
+	case 1:
+		gpuID, err := strconv.ParseUint(argv[0], 0, 64)
+		if err != nil {
+			usage()
+		}
+		eth.GPUBench(gpuID)
+	default:
+		usage()
+	}
+}
+
 func version(c *cli.Context) {
 	fmt.Println(ClientIdentifier)
 	fmt.Println("Version:", Version)
--- a/cmd/utils/flags.go
+++ b/cmd/utils/flags.go
@ -155,6 +155,12 @@ var (
 	}

 	// miner settings
+	// TODO: refactor CPU vs GPU mining flags
+	MiningGPUFlag = cli.StringFlag{
+		Name:  "minegpu",
+		Usage: "Mine with given GPUs. '--minegpu 0,1' will mine with the first two GPUs found.",
+	}
+
 	MinerThreadsFlag = cli.IntFlag{
 		Name:  "minerthreads",
 		Usage: "Number of miner threads",
--- a/common/natspec/natspec_e2e_test.go
+++ b/common/natspec/natspec_e2e_test.go
@ -306,7 +306,7 @@ func processTxs(repl *testFrontend, t *testing.T, expTxc int) bool {
 		return false
 	}

-	err = repl.ethereum.StartMining(runtime.NumCPU())
+	err = repl.ethereum.StartMining(runtime.NumCPU(), "")
 	if err != nil {
 		t.Errorf("unexpected error mining: %v", err)
 		return false
--- a/core/chain_makers.go
+++ b/core/chain_makers.go
@ -32,7 +32,7 @@ import (
 // It returns true from Verify for any block.
 type FakePow struct{}

-func (f FakePow) Search(block pow.Block, stop <-chan struct{}) (uint64, []byte) {
+func (f FakePow) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
 	return 0, nil
 }
 func (f FakePow) Verify(block pow.Block) bool { return true }
--- a/core/chain_pow_test.go
+++ b/core/chain_pow_test.go
@ -34,7 +34,7 @@ type failPow struct {
 	failing uint64
 }

-func (pow failPow) Search(pow.Block, <-chan struct{}) (uint64, []byte) {
+func (pow failPow) Search(pow.Block, <-chan struct{}, int) (uint64, []byte) {
 	return 0, nil
 }
 func (pow failPow) Verify(block pow.Block) bool { return block.NumberU64() != pow.failing }
@ -47,7 +47,7 @@ type delayedPow struct {
 	delay time.Duration
 }

-func (pow delayedPow) Search(pow.Block, <-chan struct{}) (uint64, []byte) {
+func (pow delayedPow) Search(pow.Block, <-chan struct{}, int) (uint64, []byte) {
 	return 0, nil
 }
 func (pow delayedPow) Verify(block pow.Block) bool { time.Sleep(pow.delay); return true }
--- a/eth/backend.go
+++ b/eth/backend.go
@ -498,18 +498,6 @@ func (s *Ethereum) ResetWithGenesisBlock(gb *types.Block) {
 	s.blockchain.ResetWithGenesisBlock(gb)
 }

-func (s *Ethereum) StartMining(threads int) error {
-	eb, err := s.Etherbase()
-	if err != nil {
-		err = fmt.Errorf("Cannot start mining without etherbase address: %v", err)
-		glog.V(logger.Error).Infoln(err)
-		return err
-	}
-
-	go s.miner.Start(eb, threads)
-	return nil
-}
-
 func (s *Ethereum) Etherbase() (eb common.Address, err error) {
 	eb = s.etherbase
 	if (eb == common.Address{}) {
--- a/eth/cpu_mining.go
+++ b/eth/cpu_mining.go
@ -0,0 +1,54 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build !opencl
+
+package eth
+
+import (
+	"errors"
+	"fmt"
+
+	"github.com/ethereum/go-ethereum/logger"
+	"github.com/ethereum/go-ethereum/logger/glog"
+)
+
+const disabledInfo = "Set GO_OPENCL and re-build to enable."
+
+// StartMining starts CPU mining with the given number of threads in a build
+// without the opencl tag.
+//
+// It returns an error when no etherbase address can be obtained (the error is
+// also logged), or when gpus is non-empty, because GPU mining is compiled out
+// of this build. Otherwise the miner is started in its own goroutine and nil
+// is returned immediately.
+func (s *Ethereum) StartMining(threads int, gpus string) error {
+	etherbase, err := s.Etherbase()
+	switch {
+	case err != nil:
+		wrapped := fmt.Errorf("Cannot start mining without etherbase address: %v", err)
+		glog.V(logger.Error).Infoln(wrapped)
+		return wrapped
+	case gpus != "":
+		return errors.New("GPU mining disabled. " + disabledInfo)
+	}
+
+	// CPU mining
+	go s.miner.Start(etherbase, threads)
+	return nil
+}
+
+// GPUBench is the stub used when the opencl build tag is not set. It ignores
+// gpuid and only prints a notice that GPU mining is disabled.
+func GPUBench(gpuid uint64) {
+	fmt.Println("GPU mining disabled. " + disabledInfo)
+}
+
+// PrintOpenCLDevices is the stub used when the opencl build tag is not set.
+// It only prints a notice that OpenCL is disabled.
+func PrintOpenCLDevices() {
+	fmt.Println("OpenCL disabled. " + disabledInfo)
+}
--- a/eth/gpu_mining.go
+++ b/eth/gpu_mining.go
@ -0,0 +1,103 @@
+// Copyright 2014 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+
+// +build opencl
+
+package eth
+
+import (
+	"fmt"
+	"math/big"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/ethereum/ethash"
+	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/core/types"
+	"github.com/ethereum/go-ethereum/logger"
+	"github.com/ethereum/go-ethereum/logger/glog"
+	"github.com/ethereum/go-ethereum/miner"
+)
+
+// StartMining starts mining in a build with the opencl tag.
+//
+// When gpus is empty the existing miner is started on the CPU with the given
+// number of threads. Otherwise gpus is parsed as a comma-separated list of
+// non-negative GPU ids (for example "0,1"), the miner is replaced by one
+// backed by an OpenCL ethash instance for those ids, and it is started with
+// one agent per id; threads is ignored in that case.
+//
+// An error is returned when no etherbase address can be obtained (that error
+// is also logged) or when gpus contains an invalid id. On success the miner
+// runs in its own goroutine and nil is returned immediately.
+func (s *Ethereum) StartMining(threads int, gpus string) error {
+	eb, err := s.Etherbase()
+	if err != nil {
+		err = fmt.Errorf("Cannot start mining without etherbase address: %v", err)
+		glog.V(logger.Error).Infoln(err)
+		return err
+	}
+
+	// CPU mining
+	if gpus == "" {
+		go s.miner.Start(eb, threads)
+		return nil
+	}
+
+	// GPU mining
+	ids, err := parseGPUIDs(gpus)
+	if err != nil {
+		return err
+	}
+
+	// TODO: re-creating miner is a bit ugly
+	cl := ethash.NewCL(ids)
+	s.miner = miner.New(s, s.EventMux(), cl)
+	go s.miner.Start(eb, len(ids))
+	return nil
+}
+
+// parseGPUIDs converts a comma-separated list such as "0,1" into GPU ids.
+// Whitespace around each entry is ignored, so "0, 1" is accepted as well.
+// It fails on entries that are not integers or that are negative.
+func parseGPUIDs(gpus string) ([]int, error) {
+	fields := strings.Split(gpus, ",")
+	ids := make([]int, 0, len(fields))
+	for _, field := range fields {
+		id, err := strconv.Atoi(strings.TrimSpace(field))
+		if err != nil {
+			return nil, fmt.Errorf("Invalid GPU id(s): %v", err)
+		}
+		if id < 0 {
+			return nil, fmt.Errorf("Invalid GPU id: %v", id)
+		}
+		ids = append(ids, id)
+	}
+	return ids, nil
+}
+
+// GPUBench runs a fixed-length mining benchmark on the GPU with id gpuid and
+// prints the hash rate reported by the OpenCL miner every three seconds.
+//
+// It builds a throw-away block (number 42, difficulty 999999999999999),
+// initialises OpenCL for that block number, and searches on it for the
+// duration of the benchmark. An OpenCL initialisation failure is printed and
+// ends the benchmark early. When the function returns, the stop channel is
+// closed so the reporter goroutine exits and the search is asked to stop.
+func GPUBench(gpuid uint64) {
+	// benchSeconds is the benchmark duration; it is both printed and slept on.
+	const benchSeconds = 60
+
+	e := ethash.NewCL([]int{int(gpuid)})
+
+	var h common.Hash
+	bogoHeader := &types.Header{
+		ParentHash: h,
+		Number:     big.NewInt(int64(42)),
+		Difficulty: big.NewInt(int64(999999999999999)),
+	}
+	bogoBlock := types.NewBlock(bogoHeader, nil, nil, nil)
+
+	err := ethash.InitCL(bogoBlock.NumberU64(), e)
+	if err != nil {
+		fmt.Println("OpenCL init error: ", err)
+		return
+	}
+
+	stopChan := make(chan struct{})
+	// Closing the channel on return stops the reporter below.
+	// NOTE(review): Search is expected to treat a closed stop channel as the
+	// signal to abort, as the miner agents do; confirm in OpenCLMiner.Search.
+	defer close(stopChan)
+
+	reportHashRate := func() {
+		ticker := time.NewTicker(3 * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				fmt.Printf("hashes/s : %v\n", e.GetHashrate())
+			case <-stopChan:
+				return
+			}
+		}
+	}
+	fmt.Printf("Starting benchmark (%v seconds)\n", benchSeconds)
+	go reportHashRate()
+	go e.Search(bogoBlock, stopChan, 0)
+	time.Sleep(benchSeconds * time.Second)
+	fmt.Println("OK.")
+}
+
+// PrintOpenCLDevices calls ethash.PrintDevices; it is the opencl-tagged
+// counterpart of the stub in cpu_mining.go.
+// NOTE(review): presumably this lists every OpenCL device found — confirm in
+// ethash.PrintDevices.
+func PrintOpenCLDevices() {
+	ethash.PrintDevices()
+}
--- a/miner/agent.go
+++ b/miner/agent.go
@ -118,7 +118,7 @@ func (self *CpuAgent) mine(work *Work, stop <-chan struct{}) {
 	glog.V(logger.Debug).Infof("(re)started agent[%d]. mining...\n", self.index)

 	// Mine
-	nonce, mixDigest := self.pow.Search(work.Block, stop)
+	nonce, mixDigest := self.pow.Search(work.Block, stop, self.index)
 	if nonce != 0 {
 		block := work.Block.WithMiningResult(nonce, common.BytesToHash(mixDigest))
 		self.returnCh <- &Result{work, block}
--- a/pow/ezp/pow.go
+++ b/pow/ezp/pow.go
@ -48,7 +48,7 @@ func (pow *EasyPow) Turbo(on bool) {
 	pow.turbo = on
 }

-func (pow *EasyPow) Search(block pow.Block, stop <-chan struct{}) (uint64, []byte) {
+func (pow *EasyPow) Search(block pow.Block, stop <-chan struct{}, index int) (uint64, []byte) {
 	r := rand.New(rand.NewSource(time.Now().UnixNano()))
 	hash := block.HashNoNonce()
 	diff := block.Difficulty()
--- a/pow/pow.go
+++ b/pow/pow.go
@ -17,7 +17,7 @@
 package pow

 type PoW interface {
-	Search(block Block, stop <-chan struct{}) (uint64, []byte)
+	Search(block Block, stop <-chan struct{}, index int) (uint64, []byte)
 	Verify(block Block) bool
 	GetHashrate() int64
 	Turbo(bool)
--- a/rpc/api/miner.go
+++ b/rpc/api/miner.go
@ -100,7 +100,7 @@ func (self *minerApi) StartMiner(req *shared.Request) (interface{}, error) {
 	}

 	self.ethereum.StartAutoDAG()
-	err := self.ethereum.StartMining(args.Threads)
+	err := self.ethereum.StartMining(args.Threads, "")
 	if err == nil {
 		return true, nil
 	}
--- a/xeth/xeth.go
+++ b/xeth/xeth.go
@ -474,7 +474,7 @@ func (self *XEth) ClientVersion() string {
 func (self *XEth) SetMining(shouldmine bool, threads int) bool {
 	ismining := self.backend.IsMining()
 	if shouldmine && !ismining {
-		err := self.backend.StartMining(threads)
+		err := self.backend.StartMining(threads, "")
 		return err == nil
 	}
 	if ismining && !shouldmine {