/* $NetBSD$ */

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Cherry G. Mathew <cherry@zyx.in>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * TODO: sysctl nodes still to be provided, and their status:
 *
 *	xen.balloon
 *	xen.balloon.current: DONE
 *	xen.balloon.target: IN PROGRESS
 *	xen.balloon.low-balloon
 *	xen.balloon.high-balloon
 *	xen.balloon.limit
 *
 * sysctl labels = { 'current'      : 'Current allocation',
 *                   'target'       : 'Requested target',
 *                   'low-balloon'  : 'Low-mem balloon',
 *                   'high-balloon' : 'High-mem balloon',
 *                   'limit'        : 'Xen hard limit' }
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include "opt_balloon.h"

#include <sys/param.h>

#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sysctl.h>

#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/balloon.h>

#include <uvm/uvm.h>
#include <uvm/uvm_extern.h>
#include <xen/xenpmap.h>

/*
 * Timeout, in milliseconds, of the balloon thread's timed wait on the
 * "memory changed" condition variable; converted to ticks with mstohz().
 */
#define BALLOONINTERVALMS 100 /* milliseconds */
/* XXX: fix limits */
/*
 * Intended lower/upper bounds for the balloon target.  Within this file
 * they are only mentioned by the disabled sanity check in the sysctl
 * handler.
 */
#define LOW_BALLOON 100 /* In pages */
#define HIGH_BALLOON SIZE_T_MAX /* In pages */

/*
 * Forward declaration: the watch callback is referenced by the
 * xenbus_balloon_watch initializer before its definition.
 */
static void xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
				   unsigned int len);

/*
 * One page currently held by the balloon.  Entries are heap allocated,
 * pushed onto balloon_conf.balloon_page_entries when a page is reserved
 * and popped (and freed) when a page is handed back to UVM.
 */
struct balloon_page_entry {
	struct vm_page *pg;	/* UVM handle of the ballooned page */
	SLIST_ENTRY(balloon_page_entry) entry;	/* list linkage */
};

/*
 * Global state of the balloon driver (single instance).
 */
static struct balloon_conf {
	/* Held by the balloon thread while waiting on / acting on a change. */
	kmutex_t flaglock; /* Protects condvar (below) */
	/* Signalled by the xenbus watcher after it stores a new target. */
	kcondvar_t cv_memchanged; /* Notifier flag for target (below) */

	/* Initialized at IPL_HIGH and taken with mutex_spin_enter(). */
	kmutex_t tgtlock; /* Spin lock, protects .target, below */
	/* Written by the xenbus watcher and the sysctl "target" node. */
	size_t target; /* Target balloon size, in pages. */

	/* Pages currently inside the balloon, most recently added first. */
	SLIST_HEAD(, balloon_page_entry) balloon_page_entries;

} balloon_conf;

/*
 * Xenbus watch on the "memory/target" node; xenbus_balloon_watcher() is
 * installed as its callback.  Registered in balloon_xenbus_setup().
 */
static struct xenbus_watch xenbus_balloon_watch = {
	.node = __UNCONST("memory/target"),
	.xbw_callback = xenbus_balloon_watcher,
};


/* Returns zero, on error */
static size_t
xenmem_get_maxreservation(void)
{
	int ret;
	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, 
				   & (domid_t) { DOMID_SELF });

	if (ret < 0) {
		/* XXX: panic() ? */
		return 0;
	}

	return ret;

}

/* Returns zero, on error */
static size_t
xenmem_get_currentreservation(void)
{
	int ret;

	ret = HYPERVISOR_memory_op(XENMEM_current_reservation,
				   & (domid_t) { DOMID_SELF });

	if (ret < 0) {
		/* XXX: panic() ? */
		return 0;
	}

	return ret;

}

/*
 * Return the requested balloon target (in pages), sampled while holding
 * the target spin lock.
 */
static size_t
balloon_get_target(void)
{
	size_t snapshot;

	mutex_spin_enter(&balloon_conf.tgtlock);
	snapshot = balloon_conf.target;
	mutex_spin_exit(&balloon_conf.tgtlock);

	return snapshot;
}

/*
 * Store a new balloon target (in pages) while holding the target spin
 * lock.  This only records the value; it does not wake the balloon
 * thread.
 */
static void
balloon_set_target(size_t target)
{
	mutex_spin_enter(&balloon_conf.tgtlock);
	balloon_conf.target = target;
	mutex_spin_exit(&balloon_conf.tgtlock);
}

/*
 * Take up to npages pages away from UVM and put them into the balloon.
 *
 * For every page obtained, its machine frame number is stored in
 * mfn_list[] (in allocation order), its P2M entry is invalidated and the
 * page is recorded on balloon_conf.balloon_page_entries.
 *
 * mfn_list must have room for npages entries.
 *
 * Returns the number of pages actually reserved, which is smaller than
 * npages when an allocation fails part way through.
 */
static size_t
reserve_pages(size_t npages, xen_pfn_t *mfn_list)
{
	struct balloon_page_entry *bpg_entry;
	size_t newpgcount;
	paddr_t pfn;

	for (newpgcount = 0; newpgcount < npages; newpgcount++) {
		struct vm_page *pg;

		pg = uvm_pagealloc(NULL, 0, NULL,
				   UVM_PGA_ZERO);

		if (pg == NULL) {
			break;
		}

		/*
		 * Allocate the tracking entry before touching the P2M
		 * table, so that the failure path below hands an
		 * untouched page back to UVM rather than one whose P2M
		 * entry has already been invalidated.
		 */
		bpg_entry = kmem_alloc(sizeof *bpg_entry, KM_SLEEP);

		if (bpg_entry == NULL) {
			uvm_pagefree(pg);
			break;
		}

		pfn = x86_btop(VM_PAGE_TO_PHYS(pg) - XPMAP_OFFSET);
		mfn_list[newpgcount] = pfn_to_mfn(pfn);

		/* Invalidate pg */
		xpmap_phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY;

		/*
		 * XXX: We don't keep a copy, but just save a pointer
		 * to the uvm pg handle. Is this ok ?
		 */
		bpg_entry->pg = pg;

		SLIST_INSERT_HEAD(&balloon_conf.balloon_page_entries,
				  bpg_entry, entry);
	}

	return newpgcount;
}

/*
 * Take up to `ret' pages out of the balloon and give them back to UVM.
 *
 * Each page popped from balloon_conf.balloon_page_entries is bound to the
 * next machine frame in mfn_list[]: the P2M table is updated, the
 * hypervisor's machine-to-physical table is updated through
 * HYPERVISOR_mmu_update(MMU_MACHPHYS_UPDATE), and the page is freed to
 * UVM.
 *
 * Returns the number of pages actually returned, which is smaller than
 * `ret' if the balloon runs empty.  Panics if the MMU update fails.
 */
static size_t
unreserve_pages(size_t ret, xen_pfn_t *mfn_list)
{

	size_t npages;

	for (npages = 0; npages < ret; npages++) {
		struct balloon_page_entry *bpg_entry;
		struct vm_page *pg;
		mmu_update_t mmu;
		paddr_t pfn;
		int tmp;

		if (SLIST_EMPTY(&balloon_conf.balloon_page_entries)) {
			/*XXX: This is the case where extra mem w.r.t boot comes in ? */
			printf("Balloon is empty. can't be collapsed further!\n");
			/*XXX: mark down target ? */
			break;
		}

		bpg_entry = SLIST_FIRST(&balloon_conf.balloon_page_entries);
		SLIST_REMOVE_HEAD(&balloon_conf.balloon_page_entries, entry);

		/* Only the page handle is needed past this point. */
		pg = bpg_entry->pg;

		kmem_free(bpg_entry, sizeof *bpg_entry);

		/* Update P->M */
		pfn = x86_btop(VM_PAGE_TO_PHYS(pg) - XPMAP_OFFSET);

		xpmap_phys_to_machine_mapping[pfn] = mfn_list[npages];

		/*
		 * Update the MMU.  Widen the frame number before it is
		 * converted to a byte address, so that the shift is not
		 * carried out in a type narrower than the machine address
		 * (assumes x86_ptob() is a plain shift of its argument).
		 */
		mmu.ptr = x86_ptob((uint64_t)mfn_list[npages]) |
		    MMU_MACHPHYS_UPDATE;
		mmu.val = pfn;

		if (HYPERVISOR_mmu_update(&mmu, 1, &tmp, DOMID_SELF) < 0) {
			panic("MMU Update failed!");
		}

		/* Free it to UVM */
		uvm_pagefree(pg);

	}

	return npages;
}

/*
 * Grow the balloon by up to npages pages, i.e. give that much memory
 * back to the hypervisor.
 *
 * Pages are first taken from UVM with reserve_pages() and then released
 * to Xen with XENMEM_decrease_reservation.  Fewer pages than requested
 * are released when UVM cannot supply them.  Pages that were reserved
 * but not accepted by the hypervisor are handed back to UVM.
 *
 * npages must be non-zero.
 */
static void
balloon_inflate(size_t npages)
{
	int ret;
	size_t respgcnt;
	xen_pfn_t *mfn_list;

	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	/*
	 * The hypercall reports its result in an int, so never ask for
	 * more than INT_MAX pages in one go; the caller retries for the
	 * remainder.
	 */
	if (npages > INT_MAX) {
		npages = INT_MAX;
	}

	mfn_list = kmem_alloc(npages * sizeof *mfn_list,
			      KM_SLEEP);

	if (mfn_list == NULL) {
		printf("%s: Error, could not allocate kernel memory",
		       __FILE__);
		return;
	}

	respgcnt = reserve_pages(npages, mfn_list);

	if (respgcnt == 0) {
		/* UVM had no page to spare: nothing to hand over. */
		goto out;
	}

	/* Hand over pages to Hypervisor */
	xenguest_handle(reservation.extent_start) = mfn_list;
	reservation.nr_extents = respgcnt;

	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);

	if (ret < 0) {
		/* Unroll loop and release page frames back to the OS. */
		unreserve_pages(respgcnt, mfn_list);
		goto out;
	}

	/* The hypervisor cannot release more than it was offered. */
	KASSERT((size_t)ret <= respgcnt);

	if ((size_t)ret < respgcnt) {
		/*
		 * Partial success (assumes the hypercall processes the
		 * extents in order and returns how many it handled).
		 * The frames from index `ret' onwards are still ours, and
		 * the pages reserved last sit at the head of the balloon
		 * list, so give exactly that many back to UVM.
		 */
		unreserve_pages(respgcnt - (size_t)ret, mfn_list + ret);
	}

	printf("inflated by %d\n", ret);

out:
	/* The frame list is scratch space only; release it on every path. */
	kmem_free(mfn_list, npages * sizeof *mfn_list);
}

/*
 * Shrink the balloon by up to npages pages, i.e. claim that much memory
 * from the hypervisor and return it to UVM.
 *
 * New machine frames are requested with XENMEM_increase_reservation and
 * then attached to ballooned pages by unreserve_pages().  Panics if the
 * hypervisor grants nothing.
 *
 * npages must be non-zero.
 */
static void
balloon_deflate(size_t npages)
{
	int ret;
	size_t pgmax;
	size_t deflated;
	xen_pfn_t *mfn_list;

	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	/*
	 * The hypercall reports its result in an int, so never ask for
	 * more than INT_MAX pages in one go; the caller retries for the
	 * remainder.
	 */
	if (npages > INT_MAX) {
		npages = INT_MAX;
	}

	/*
	 * XXX: This is wrong. npages is the _delta_.
	 * Trim npages, if its exceeded the hard limit
	 */
	if ((pgmax = xenmem_get_maxreservation()) > 0) {
		pgmax -= xenmem_get_currentreservation();
	}

	if (npages > pgmax && pgmax > 0) {
		npages = pgmax;
	}

	mfn_list = kmem_alloc(npages * sizeof *mfn_list, KM_SLEEP);

	if (mfn_list == NULL) {
		printf("%s: Error, could not allocate kernel memory",
		       __FILE__);
		return;
	}

	xenguest_handle(reservation.extent_start) = mfn_list;
	reservation.nr_extents = npages;

	ret = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);

	if (ret <= 0) {
		panic("Increase reservation failed");
		/* NOTREACHED */
		return;
	}

	/*
	 * Keep the result in its own variable: npages is still needed
	 * below as the size of the mfn_list allocation.
	 */
	deflated = unreserve_pages(ret, mfn_list);

	/* The frame list is scratch space only; do not leak it. */
	kmem_free(mfn_list, npages * sizeof *mfn_list);

	printf("deflated by %zu\n", deflated);
}

/*
 * Make one attempt at moving the domain's reservation towards
 * targetpages: deflate the balloon if the domain holds fewer pages than
 * the target, inflate it if it holds more.
 *
 * Returns the current reservation as reported by the hypervisor after
 * the attempt (or immediately, when it already equals the target).
 */
static size_t
balloon_resize(size_t targetpages)
{
	size_t have;

	have = xenmem_get_currentreservation();
	KASSERT(have > 0);

	/* Already there: nothing to do. */
	if (have == targetpages) {
		return have;
	}

	if (have < targetpages) {
		/* Domain should own more memory: shrink the balloon. */
		balloon_deflate(targetpages - have);
	} else {
		/* Domain should own less memory: grow the balloon. */
		balloon_inflate(have - targetpages);
	}

	/* Re-read: the adjustment may have been partial. */
	have = xenmem_get_currentreservation();
	KASSERT(have > 0);

	yield();

	return have;
}

/*
 * Body of the balloon kernel thread; never returns.
 *
 * Sleeps on cv_memchanged (with a BALLOONINTERVALMS timeout) for as long
 * as the requested target equals the value the last resize ended on,
 * then calls balloon_resize() with the new target.  flaglock is held
 * across both the wait and the resize.
 */
static void
balloon_thread(void *ignore)
{
	const int waitticks = mstohz(BALLOONINTERVALMS);
	size_t lastseen;

	/*
	 * Seed with whatever the target is right now, so that the wait
	 * loop below does not fall through before the target is set for
	 * the first time.  The seed itself is probably not meaningful.
	 */
	lastseen = balloon_get_target();

	for (;;) {
		size_t wanted;

		mutex_enter(&balloon_conf.flaglock);

		/* Wake up on a signal or, failing that, on the timeout. */
		while (balloon_get_target() == lastseen) {
			cv_timedwait(&balloon_conf.cv_memchanged,
			    &balloon_conf.flaglock, waitticks);
		}

		wanted = balloon_get_target();

		/* Remember where the resize ended, not what was asked. */
		lastseen = balloon_resize(wanted);

		mutex_exit(&balloon_conf.flaglock);
	}
}

/*
 * Read the "memory/target" xenbus node and convert it to a page count.
 *
 * The value read is multiplied by 1024 and divided by PAGE_SIZE, i.e. it
 * is presumably expressed in KiB.
 *
 * Returns the target in pages, or zero if the node could not be read.
 */
static size_t
xenbus_balloon_read_target(void)
{
	unsigned long long raw;

	if (xenbus_read_ull(NULL, "memory", "target", &raw, 0) != 0) {
		printf("error, couldn't read\n");
		return 0;
	}

	/* Convert to npages */
	return raw * 1024 / PAGE_SIZE;
}

/*
 * Xenbus watch callback for "memory/target".
 *
 * Re-reads the target, stores it with balloon_set_target() and tries to
 * wake the balloon thread.  A target of zero (which is also what a
 * failed read yields) is ignored.  The watch arguments are not used.
 */
static void
xenbus_balloon_watcher(struct xenbus_watch *watch, const char **vec,
		       unsigned int len)
{
	unsigned long long pages;

	pages = xenbus_balloon_read_target();
	if (pages == 0) {
		/* Don't update target value */
		return;
	}

	balloon_set_target(pages);
	printf("Setting target to %llu\n", pages);
	printf("Current reservation is %zu\n", xenmem_get_currentreservation());

	/*
	 * Notify balloon thread, if we can.  If the lock is busy the
	 * notification is skipped; the thread's timed wait picks the new
	 * target up on its own.
	 */
	if (!mutex_tryenter(&balloon_conf.flaglock)) {
		return;
	}

	cv_signal(&balloon_conf.cv_memchanged);
	mutex_exit(&balloon_conf.flaglock);
}

/*
 * Initialize the balloon driver: set up locks, the condition variable
 * and the balloon page list, register the xenbus watch on
 * "memory/target" and start the balloon kernel thread.
 *
 * If either the watch or the thread cannot be set up, everything that
 * was initialized is torn down again and the function returns without
 * the balloon being active.  Does nothing unless XEN_BALLOON is defined.
 */
void
balloon_xenbus_setup(void)
{

#ifdef XEN_BALLOON

	/* Setup flaglocks, condvars et. al */
	mutex_init(&balloon_conf.flaglock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&balloon_conf.tgtlock, MUTEX_DEFAULT, IPL_HIGH);
	cv_init(&balloon_conf.cv_memchanged, "ballooning");

	SLIST_INIT(&balloon_conf.balloon_page_entries);

	/* Setup xenbus node watch callback */
	if (register_xenbus_watch(&xenbus_balloon_watch)) {
		aprint_error("%s: unable to watch memory/target\n", __func__);
		goto fail;
	}

	/* Setup kernel thread to asynchronously (in/de)-flate the balloon */
	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, balloon_thread,
			   NULL /* arg */, NULL, "balloon")) {
		aprint_error("%s: unable to create balloon thread\n", __func__);
		unregister_xenbus_watch(&xenbus_balloon_watch);
		goto fail;
	}

	return;

fail:
	/*
	 * Stop here on any failure: the thread must not be started on top
	 * of destroyed locks, and the locks must be destroyed only once.
	 */
	cv_destroy(&balloon_conf.cv_memchanged);
	mutex_destroy(&balloon_conf.tgtlock);
	mutex_destroy(&balloon_conf.flaglock);

#endif
	return;

}


/* 
 * sysctl(9) stuff 
 */

/*
 * sysctl helper routine, shared by the "current" and "target" nodes.
 *
 * "current": reports the domain's current reservation as returned by
 *	xenmem_get_currentreservation().
 * "target":  reports the requested balloon target; when a different
 *	value is written it becomes the new target.
 *
 * Returns 0 on success, the error from sysctl_lookup(), or EINVAL for an
 * unknown node or a new target that is negative or does not fit the
 * size_t target.
 */
static int
sysctl_kern_xen_balloon(SYSCTLFN_ARGS)
{

	struct sysctlnode node;

	/*
	 * Assumes SIZE_T_MAX <= ((uint64_t) -1) see createv() in
	 * SYSCTL_SETUP(...) below
	 */

	int error;
	int64_t node_val;
	int64_t newnode_val;

	KASSERT(rnode != NULL);
	node = *rnode;

	if (strcmp(node.sysctl_name, "current") == 0) {
		node_val = xenmem_get_currentreservation();
		KASSERT(node_val < SIZE_T_MAX);
		node.sysctl_data = &node_val;
		return sysctl_lookup(SYSCTLFN_CALL(&node));

	} else if (strcmp(node.sysctl_name, "target") == 0) {

		/* Let sysctl_lookup() read from / write to a scratch copy. */
		newnode_val = node_val = balloon_get_target();
		node.sysctl_data = &newnode_val;
		error = sysctl_lookup(SYSCTLFN_CALL(&node));
		if (error != 0) {
			return error;
		}

		KASSERT(node_val < SIZE_T_MAX);

		if (node_val == newnode_val) {
			/* Plain read, or a write of the same value. */
			return 0;
		}

		/*
		 * The node is a signed quad but the target is a size_t:
		 * refuse values that would silently turn into a huge or
		 * truncated page count.
		 * XXX: also enforce LOW_BALLOON/HIGH_BALLOON once those
		 * limits are fixed.
		 */
		if (newnode_val < 0 ||
		    (uint64_t)newnode_val >= SIZE_T_MAX) {
			return EINVAL;
		}

		printf("setting to %qd\n", newnode_val);
		balloon_set_target(newnode_val);

		return 0;
	}

	return EINVAL;
}

/*
 * Create the sysctl nodes of the balloon driver: the "kern", "xen" and
 * "balloon" nodes are created as a chain, each one under the node
 * returned by the previous call, followed by the "current" (read-only)
 * and "target" (read-write) quad leaves, both served by
 * sysctl_kern_xen_balloon().
 */
SYSCTL_SETUP(sysctl_kern_xen_balloon_setup, "sysctl kern.xen.balloon setup")
{
	const struct sysctlnode *parent = NULL;

	/* kern */
	sysctl_createv(clog, 0, NULL, &parent,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "kern", NULL,
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_EOL);

	/* XXX: if (parent != NULL) */
	/* xen, below the node returned above */
	sysctl_createv(clog, 0, &parent, &parent,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "xen",
	    SYSCTL_DESCR("Xen"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);

	/* balloon, below xen */
	sysctl_createv(clog, 0, &parent, &parent,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "balloon",
	    SYSCTL_DESCR("Balloon"),
	    NULL, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);

	/* Leaf: current (read-only) */
	sysctl_createv(clog, 0, &parent, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_QUAD, "current",
	    SYSCTL_DESCR("Current balloon size"),
	    sysctl_kern_xen_balloon, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);

	/* Leaf: target (read-write) */
	sysctl_createv(clog, 0, &parent, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "target",
	    SYSCTL_DESCR("Target balloon size"),
	    sysctl_kern_xen_balloon, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);

}
