/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <asm/mach/time.h>
#include <asm/uaccess.h>
#include <linux/proc_fs.h>

#include "mvXor.h"

#undef DEBUG
//#define DEBUG 

#ifdef DEBUG
	#define DPRINTK(s, args...)  printk("MV_XOR: " s, ## args)
#else
	#define DPRINTK(s, args...)
#endif

#ifdef CONFIG_USE_TWO_ENGINES
#define XOR_MAX_CHANNELS    2
#else
#define XOR_MAX_CHANNELS    1
#endif

#define XOR_TIMEOUT 0x8000000
/*
 * Per-channel bookkeeping for one XOR/DMA engine channel.
 * Instances live in xor_channel[] and are filled in by mv_xor_init().
 */
struct xor_channel_t
{
    MV_XOR_DESC     *pDescriptor;   /* this channel's descriptor (CPU pointer into pDescriptors) */
    dma_addr_t      descPhyAddr;    /* bus address of the same descriptor, handed to mvXorTransfer() */
    wait_queue_head_t waitq;        /* woken by mv_xor_isr(); waited on by xor_mv() in interrupt mode */
    struct semaphore  sema;         /* ownership: taken in allocate_channel(), released in free_channel() */
#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
    int             irq_num;        /* IRQ line passed to request_irq() */
    const char      *name;          /* name passed to request_irq() */
#endif
    int chan_num;                   /* index of this entry in xor_channel[] */
    int	chan_active;                /* set before starting an operation, cleared by xor_waiton_eng() */
};

/* State of every channel this driver uses, indexed by channel number. */
struct xor_channel_t  xor_channel[XOR_MAX_CHANNELS];

/* Held by dma_memzero() around its mvXorMemInit() operation. */
struct semaphore  meminit_sema;
/* Set to 1 at the end of mv_xor_init(); the copy/xor entry points fall back while it is 0. */
int xor_engine_initialized = 0;

/* Descriptor array (one per channel) from dma_alloc_coherent(), and its bus address. */
MV_XOR_DESC     *pDescriptors;
dma_addr_t      descsPhyAddr;
/* dma_copy() copies chunks smaller than this with the CPU instead of the engine. */
#define XOR_MIN_COPY_CHUNK 128
/* Only reported through /proc/mv_xor in this file. */
static unsigned long mv_dma_min_buffer = IDMA_MIN_COPY;

/* /proc/mv_xor entry and the statistics counters it reports. */
static struct proc_dir_entry *xor_read_proc_entry;
static int xor_dma_hit = 0, xor_dma_miss = 0, xor_dma_unaligned = 0, xor_hit = 0;
static int xor_memzero_hit = 0, xor_memzero_miss = 0, xor_memzero_unaligned = 0;
static int dma_to_user = 0;
static int dma_from_user = 0;
/* RT_DEBUG is always defined here, so dma_activations is always compiled in. */
#define RT_DEBUG
#ifdef RT_DEBUG
/* Number of engine transfers started by dma_copy(). */
static int dma_activations = 0;
#endif
#ifdef CONFIG_MV_USE_XOR_FOR_COPY_USER_BUFFERS
/* Forward declaration: the definition follows its two wrappers. */
static unsigned long dma_copy(void *to, const void *from, unsigned long n, unsigned int to_user);
#endif
/*
 * Number of bytes from 'virt' up to the end of the page that contains it.
 * A page-aligned address yields a full PAGE_SIZE.
 */
static inline u32 page_remainder(u32 virt)
{
	u32 offset_in_page = virt & ~PAGE_MASK;

	return PAGE_SIZE - offset_in_page;
}

/*
 * Map a kernel virtual address or kernel logical address to a physical
 * address.
 *
 * virt:  virtual address to translate
 * write: passed through to follow_page() as its access flag
 *
 * Addresses accepted by virt_addr_valid() are translated with __pa().
 * Anything else is looked up with follow_page(), in init_mm for addresses
 * at or above TASK_SIZE and in current->mm otherwise.
 *
 * Returns the physical address, or 0 when no valid page backs 'virt'.
 */
static inline u32 physical_address(u32 virt, int write)
{
    struct page *page;

    DPRINTK(" get physical address: virt %x , write %d\n", virt, write);

    /* kernel static-mapped address */
    if (virt_addr_valid(virt))
        return __pa((u32) virt);

    if (virt >= TASK_SIZE)
        page = follow_page(&init_mm, (u32) virt, write);
    else
        page = follow_page(current->mm, (u32) virt, write);

    /* follow_page() yields no page for an unmapped address: report "none" */
    if (!page)
        return 0;

    if (!pfn_valid(page_to_pfn(page)))
        return 0;

    return ((page_to_pfn(page) << PAGE_SHIFT) |
            ((u32) virt & (PAGE_SIZE - 1)));
}


int allocate_channel(void)
{
    int chan;
    for(chan = 0; chan < XOR_MAX_CHANNELS; chan++)
    {        
        if(down_trylock(&xor_channel[chan].sema))
        {
            DPRINTK("XOR engine %d is busy\n", chan);
            continue;
        }
	if(mvXorStateGet(chan) != MV_IDLE) {
		printk("ERR: %s XOR chan %d is not idle",__FUNCTION__, chan);
	}
        return chan;
    }
    DPRINTK("XOR engines are busy, return\n");
    return -1;
}
void inline free_channel(struct xor_channel_t *channel)
{
	if(mvXorStateGet(channel->chan_num) != MV_IDLE){
		printk("ERR: %s XOR chan %d is not idle",__FUNCTION__, channel->chan_num);
		BUG();
	}
    up(&channel->sema);
}

/*
 * Cause-register bits BIT0|BIT1 of channel 'chan'; each channel's bits are
 * 16 positions apart. The argument is parenthesized so that expressions
 * such as (a + b) expand correctly.
 */
#define XOR_CAUSE_DONE_MASK(chan) ((BIT0|BIT1) << ((chan) * 16))

/*
 * Busy-wait until the operation started on 'chan' has finished.
 *
 * Does nothing when the channel has no operation in flight (chan_active
 * is 0). Otherwise polls, in order:
 *   1. the cause register, until one of this channel's done bits is set;
 *   2. the channel state, until it is MV_IDLE.
 * It then clears the channel's done bits and marks the channel inactive.
 *
 * Each polling loop is bounded by XOR_TIMEOUT iterations; exceeding the
 * bound is treated as a fatal error (BUG()).
 */
void xor_waiton_eng(int chan)
{
    int timeout = 0;

    if (!xor_channel[chan].chan_active)
        return;

    /*
     * Bitwise AND: test this channel's done bits. A logical AND here would
     * merely test "cause register is non-zero".
     */
    while (!(MV_REG_READ(XOR_CAUSE_REG) & XOR_CAUSE_DONE_MASK(chan)))
    {
        if (timeout > XOR_TIMEOUT)
            goto timed_out;
        timeout++;
    }

    timeout = 0;
    while (mvXorStateGet(chan) != MV_IDLE)
    {
        if (timeout > XOR_TIMEOUT)
            goto timed_out;
        timeout++;
    }

    /* Clear int: write 0 to this channel's done bits, 1 everywhere else */
    MV_REG_WRITE(XOR_CAUSE_REG, ~(XOR_CAUSE_DONE_MASK(chan)));
    xor_channel[chan].chan_active = 0;
    return;

timed_out:
    printk("ERR: XOR eng got timedout!!\n");
    BUG();
}

void
print_xor_regs(int chan)
{
    printk(" XOR_CHANNEL_ARBITER_REG %08x\n", MV_REG_READ(XOR_CHANNEL_ARBITER_REG));
    printk(" XOR_CONFIG_REG %08x\n", MV_REG_READ(XOR_CONFIG_REG(chan)));
    printk(" XOR_ACTIVATION_REG %08x\n", MV_REG_READ(XOR_ACTIVATION_REG(chan)));
    printk(" XOR_CAUSE_REG %08x\n", MV_REG_READ(XOR_CAUSE_REG));
    printk(" XOR_MASK_REG %08x\n", MV_REG_READ(XOR_MASK_REG));
    printk(" XOR_ERROR_CAUSE_REG %08x\n", MV_REG_READ(XOR_ERROR_CAUSE_REG));
    printk(" XOR_ERROR_ADDR_REG %08x\n", MV_REG_READ(XOR_ERROR_ADDR_REG));
    printk(" XOR_NEXT_DESC_PTR_REG %08x\n", MV_REG_READ(XOR_NEXT_DESC_PTR_REG(chan)));
    printk(" XOR_CURR_DESC_PTR_REG %08x\n", MV_REG_READ(XOR_CURR_DESC_PTR_REG(chan)));
    printk(" XOR_BYTE_COUNT_REG %08x\n", MV_REG_READ(XOR_BYTE_COUNT_REG(chan)));
}
#ifdef CONFIG_MV_RAID5_XOR_OFFLOAD
/*
 * XOR 'src_no' buffers of 'bytes' bytes each using one engine channel.
 *
 * src_no: number of buffers in bh_ptr; must be greater than 1
 * bytes:  length of every buffer
 * bh_ptr: array of buffer pointers; bh_ptr[0] is both a source and the
 *         destination of the result
 *
 * Returns 0 when the engine performed the operation, or 'bytes' when it
 * did not (engines not initialized yet, or no free channel).
 *
 * NOTE(review): src_no is not checked against the number of source slots
 * in MV_XOR_DESC; presumably callers never exceed it - confirm.
 */
int xor_mv(unsigned int src_no, unsigned int bytes, void **bh_ptr)
{
	unsigned long *bptr = NULL;
	int i;
        u32      *srcAddr;
        int         chan;
        struct xor_channel_t *channel;

	if(src_no <= 1)
	{
		printk(KERN_ERR "%s: need more than 1 src for XOR\n",
			__func__);
		BUG();
                return bytes;
	}
        if (xor_engine_initialized == 0)
        {
            printk(KERN_WARNING" %s: xor engines not initialized yet\n", __func__);
            return bytes;
        }

        chan = allocate_channel();
        if ( chan == -1)
         {
                DPRINTK("XOR engines are busy, return\n");
                return bytes;
        }
	DPRINTK("setting up rest of descriptor for channel %d\n", chan);
        channel = &xor_channel[chan];
	/* Write the buffers back from the cache before the XOR engine touches them. */
	/* The source slots are addressed as an array starting at srcAdd0. */
        srcAddr = &(channel->pDescriptor->srcAdd0);
	for(i = src_no-1; i >= 0; i--)
	{
		DPRINTK("flushing source %d\n", i);
		bptr = (unsigned long *)(bh_ptr[i]);
		/* Buffer 0 is also the destination */
		if(i==0)
			dmac_flush_range((unsigned long)bptr,
				 (unsigned long)bptr + bytes);			
		else
			dmac_clean_range((unsigned long)bptr,
				 (unsigned long)bptr + bytes);
                srcAddr[i] = virt_to_phys((unsigned long *)bh_ptr[i]);
	}

	/* Single descriptor (no next pointer); one command bit per source. */
	channel->pDescriptor->phyDestAdd = 
			virt_to_phys((unsigned long *)bh_ptr[0]);
        channel->pDescriptor->byteCnt = bytes;
        channel->pDescriptor->phyNextDescPtr = 0;
        channel->pDescriptor->descCommand = (1 << src_no) - 1;
        /* BIT31 is set here; the interrupt-mode wait below waits for it to clear. */
        channel->pDescriptor->status = BIT31;
	channel->chan_active = 1;
        if( mvXorTransfer(chan, MV_XOR, channel->descPhyAddr) != MV_OK )
        {
            printk(KERN_ERR "%s: XOR operation on channel %d failed!\n", __func__, chan);
            print_xor_regs(chan);
            BUG();
            free_channel(channel);
            return bytes;
        }
#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
        /* Sleep until mv_xor_isr() wakes us and the status BIT31 is clear. */
        wait_event(channel->waitq, (( channel->pDescriptor->status & BIT31) == 0));/*TODO add timeout*/
#else
        /* Polling mode: spin until the engine reports completion. */
        xor_waiton_eng(chan);
#endif
	DPRINTK("XOR complete\n");
	/* Disabled: check of status BIT30 after completion. */
#if 0
	if (!(channel->pDescriptor->status & BIT30)) {
	    printk(KERN_ERR "%s: XOR operation completed with error!\n", __func__);
            print_xor_regs(chan);            
	    BUG();
            free_channel(channel);
	    return MV_REG_READ(XOR_BYTE_COUNT_REG(chan));
        }
#endif
	DPRINTK("invalidate result in cache\n");
	/* Disabled: buffer 0 was already flushed (clean + invalidate) above. */
#if 0
	// invalidate the cache region to destination
        bptr = (unsigned long *)(bh_ptr[0]);
	dmac_inv_range((unsigned long)bptr,
		       (unsigned long)bptr + bytes);
#endif
        free_channel(channel);
        xor_hit++;
        return 0;
}
#endif
#ifdef CONFIG_MV_DMACOPY

/*=======================================================================*/
/*  Procedure:  dma_memcpy()                                             */
/*                                                                       */
/*  Description:    DMA-based in-kernel memcpy.                          */
/*                                                                       */
/*  Parameters:  to: destination address                                 */
/*               from: source address                                    */
/*               n: number of bytes to transfer                          */
/*                                                                       */
/*  Returns:     void*: to                                               */
/*                                                                       */
/*  Notes/Assumptions:                                                   */
/*              Assumes that kernel physical memory is contiguous, i.e., */
/*              the physical addresses of contiguous virtual addresses   */
/*              are also contiguous.                                     */
/*              Assumes that kernel memory doesn't get paged.            */
/*              The DMA is polling                                       */
/*                                                                       */
/*=======================================================================*/
/*
 * dma_memcpy() - copy n bytes from 'from' to 'to' using an engine channel.
 *
 * Always returns the original destination pointer, like memcpy().
 *
 * The whole copy is done by asm_memcpy() (and counted as a miss) when:
 *   - the engines are not initialized yet,
 *   - either address is rejected by virt_addr_valid(),
 *   - n is below 64 bytes (too small to trim up to 32 head bytes and up to
 *     31 tail bytes without running past the buffers),
 *   - the two regions overlap.
 * Otherwise the unaligned head and tail are copied with asm_memcpy() and
 * the middle part is handed to the engine; if no channel is free the
 * middle part is copied with asm_memcpy() as well.
 *
 * NOTE(review): when the source misalignment is smaller than the
 * destination's, the head fix-up aligns the source but leaves the
 * destination unaligned - confirm whether the engine/cache handling
 * really tolerates that.
 */
void *dma_memcpy(void *to, const void *from, __kernel_size_t n)
{
	u32 xor_dma_unaligned_to, xor_dma_unaligned_from;
	void *orig_to = to;
	u32 to_pa, from_pa;
	int ua = 0;
	int chan;
	struct xor_channel_t *channel;

	DPRINTK("dma_memcpy(0x%x, 0x%x, %lu): entering\n", (u32) to, (u32) from,
		(unsigned long)n);

	if (xor_engine_initialized == 0) {
		DPRINTK(KERN_WARNING" %s: xor engines not initialized yet\n", __func__);
		xor_dma_miss++;
		return asm_memcpy(to, from, n);
	}
	if (!(virt_addr_valid((u32) to) && virt_addr_valid((u32) from))) {
		DPRINTK("dma_memcpy(0x%x, 0x%x, %lu): falling back to memcpy\n",
			(u32) to, (u32) from, (unsigned long)n);
		xor_dma_miss++;
		return asm_memcpy(to, from, n);
	}

	/*
	 * The fix-ups below copy up to 32 bytes at the head and up to 31 at
	 * the tail and subtract them from n without checking. Anything
	 * shorter than two cache lines could underflow n and overrun the
	 * buffers, so let the CPU copy it.
	 */
	if (n < 64) {
		xor_dma_miss++;
		return asm_memcpy(to, from, n);
	}

	/*
	 * We can only handle completely cache-aligned transactions
	 * with the DMA engine.  Source and Dst must be cache-line
	 * aligned AND the length must be a multiple of the cache-line.
	 */
	to_pa = virt_to_phys(to);
	from_pa = virt_to_phys((void*)from);

	if (((to_pa + n > from_pa) && (to_pa < from_pa)) ||
	    ((from_pa < to_pa) && (from_pa + n > to_pa))) {
		DPRINTK("overlapping copy region (0x%x, 0x%x, %lu), falling back\n",
		     to_pa, from_pa, (unsigned long)n);
		xor_dma_miss++;
		return asm_memcpy(to, from, n);
	}

	/*
	 * Ok, start addr is not cache line-aligned, so we need to make it so.
	 */
	xor_dma_unaligned_to = (u32) to & 31;
	xor_dma_unaligned_from = (u32) from & 31;
	if (xor_dma_unaligned_to | xor_dma_unaligned_from) {
		ua++;
		if (xor_dma_unaligned_from > xor_dma_unaligned_to) {
			asm_memcpy(to, from, 32 - xor_dma_unaligned_to);
			to = (void *)((u32)to + 32 - xor_dma_unaligned_to);
			from = (void *)((u32)from + 32 - xor_dma_unaligned_to);
			n -= 32 - xor_dma_unaligned_to;
		} else {
			asm_memcpy(to, from, 32 - xor_dma_unaligned_from);
			to = (void *)((u32)to + 32 - xor_dma_unaligned_from);
			from = (void *)((u32)from + 32 - xor_dma_unaligned_from);
			n -= 32 - xor_dma_unaligned_from;
		}
	}

	/*
	 * Ok, we're aligned at the top, now let's check the end
	 * of the buffer and align that. After this we should have
	 * a block that is a multiple of cache line size.
	 */
	xor_dma_unaligned_to = ((u32) to + n) & 31;
	xor_dma_unaligned_from = ((u32) from + n) & 31;
	if (xor_dma_unaligned_to | xor_dma_unaligned_from) {
		ua++;
		if (xor_dma_unaligned_to > xor_dma_unaligned_from) {
			u32 tmp_to = (u32) to + n - xor_dma_unaligned_to;
			u32 tmp_from = (u32) from + n - xor_dma_unaligned_to;

			asm_memcpy((void *)tmp_to, (void *)tmp_from,
				   xor_dma_unaligned_to);

			n -= xor_dma_unaligned_to;
		} else {
			u32 tmp_to = (u32) to + n - xor_dma_unaligned_from;
			u32 tmp_from = (u32) from + n - xor_dma_unaligned_from;

			asm_memcpy((void *)tmp_to, (void *)tmp_from,
				   xor_dma_unaligned_from);

			n -= xor_dma_unaligned_from;
		}
	}

	/*
	 * OK! Head and tail are done; hand the middle to an engine channel.
	 */
	chan = allocate_channel();
	if (chan == -1) {
		DPRINTK("XOR engines are busy, return\n");
		xor_dma_miss++;
		/*
		 * 'to' has been advanced past the head by now, so the
		 * caller must get the original destination back.
		 */
		asm_memcpy(to, from, n);
		return orig_to;
	}
	DPRINTK("setting up rest of descriptor for channel %d\n", chan);
	channel = &xor_channel[chan];

	/* Ensure that the cache is clean: write back source, invalidate dest */
	dmac_clean_range((unsigned long)from, (unsigned long)from + n);
	dmac_inv_range((unsigned long)to, (unsigned long)to + n);

	DPRINTK("setting up rest of descriptor\n");
	/* single descriptor, no chaining */
	channel->pDescriptor->srcAdd0 = virt_to_phys((void*)from);
	channel->pDescriptor->phyDestAdd = virt_to_phys(to);
	channel->pDescriptor->byteCnt = n;
	channel->pDescriptor->phyNextDescPtr = 0;
	channel->pDescriptor->status = BIT31;
	channel->chan_active = 1;

	if (mvXorTransfer(chan, MV_DMA, channel->descPhyAddr) != MV_OK) {
		printk(KERN_ERR "%s: DMA copy operation on channel %d failed!\n", __func__, chan);
		print_xor_regs(chan);
		BUG();
		free_channel(channel);
		asm_memcpy(to, from, n);
		return orig_to;
	}
	/* the DMA is polling: spin until the engine is done */
	xor_waiton_eng(chan);

	DPRINTK("DMA copy complete\n");
	/* The descriptor status is not checked for errors after completion. */
	free_channel(channel);

	xor_dma_hit++;
	if (ua)
		xor_dma_unaligned++;

	return orig_to;
}
EXPORT_SYMBOL(dma_memcpy);
#endif
#ifdef CONFIG_MV_USE_XOR_FOR_COPY_USER_BUFFERS
/*=======================================================================*/
/*  Procedure:  dma_copy_to_user()                                       */
/*                                                                       */
/*  Description:    DMA-based copy_to_user.                              */
/*                                                                       */
/*  Parameters:  to: destination address                                 */
/*               from: source address                                    */
/*               n: number of bytes to transfer                          */
/*                                                                       */
/*  Returns:     unsigned long: number of bytes NOT copied               */
/*                                                                       */
/*  Notes/Assumptions:                                                   */
/*              Assumes that kernel physical memory is contiguous, i.e., */
/*              the physical addresses of contiguous virtual addresses   */
/*              are also contiguous.                                     */
/*              Assumes that kernel memory doesn't get paged.            */
/*              Assumes that to/from memory regions cannot overlap       */
/*                                                                       */
/*=======================================================================*/
/*
 * Copy n bytes to user space: through dma_copy() once the engines are
 * initialized, through __copy_to_user() before that.
 * Returns whatever the chosen copy routine returns.
 */
unsigned long dma_copy_to_user(void *to, const void *from, unsigned long n)
{
	if (xor_engine_initialized) {
		dma_to_user++;
		DPRINTK(KERN_CRIT "dma_copy_to_user(%#10x, 0x%#10x, %lu): entering\n", (u32) to, (u32) from, n);

		return dma_copy(to, from, n, 1);
	}

	/* engines not ready yet: plain CPU copy */
	return __copy_to_user((void *)to, (void *)from, n);
}
EXPORT_SYMBOL(dma_copy_to_user);
/*=======================================================================*/
/*  Procedure:  dma_copy_from_user()                                     */
/*                                                                       */
/*  Description:    DMA-based copy_from_user.                            */
/*                                                                       */
/*  Parameters:  to: destination address                                 */
/*               from: source address                                    */
/*               n: number of bytes to transfer                          */
/*                                                                       */
/*  Returns:     unsigned long: number of bytes NOT copied               */
/*                                                                       */
/*  Notes/Assumptions:                                                   */
/*              Assumes that kernel virtual memory is contiguous, i.e.,  */
/*              the physical addresses of contiguous virtual addresses   */
/*              are also contiguous.                                     */
/*              Assumes that kernel memory doesn't get paged.            */
/*              Assumes that to/from memory regions cannot overlap       */
/*              XXX this one doesn't quite work right yet                */
/*                                                                       */
/*=======================================================================*/
/*
 * Copy n bytes from user space: through dma_copy() once the engines are
 * initialized, through __copy_from_user() before that.
 * Returns whatever the chosen copy routine returns.
 */
unsigned long dma_copy_from_user(void *to, const void *from, unsigned long n)
{
    if (xor_engine_initialized) {
	dma_from_user++;
	DPRINTK(KERN_CRIT "dma_copy_from_user(0x%x, 0x%x, %lu): entering\n", (u32) to, (u32) from, n);

	return dma_copy(to, from, n, 0);
    }

    /* engines not ready yet: plain CPU copy */
    return __copy_from_user((void *)to, (void *)from, n);
}
}

EXPORT_SYMBOL(dma_copy_from_user);
/*
 * dma_copy() - worker shared by dma_copy_to_user() and dma_copy_from_user().
 *
 * to:      destination (user address when to_user is non-zero)
 * from:    source (user address when to_user is zero)
 * n:       byte count; must be at least 64, because up to 32 head bytes
 *          and up to 31 tail bytes are trimmed without further checks
 * to_user: non-zero for copy-to-user, zero for copy-from-user
 *
 * Two channels are claimed and used alternately, one page-bounded chunk
 * per transfer. Chunks below XOR_MIN_COPY_CHUNK, the unaligned head and
 * the unaligned tail are copied by the CPU.
 *
 * Returns 0 when the loop consumed everything; otherwise the result of
 * __copy_to_user()/__copy_from_user() on the part that was left (no two
 * channels available, or a page had no physical address).
 *
 * NOTE(review): the results of the head/tail/small-chunk CPU copies are
 * ignored, and those copies run with current->mm->page_table_lock held
 * and interrupts off - confirm they cannot fault here.
 */
static unsigned long dma_copy(void *to, const void *from, unsigned long n, unsigned int to_user)
{
	u32 chunk;
	u32 k_chunk = 0;	/* bytes left in the current kernel-side run */
	u32 u_chunk = 0;	/* bytes left in the current user page */
	u32 phys_from, phys_to;
	unsigned long flags;
	u32 unaligned_to;
	u32 index = 0;		/* number of engine transfers started so far */
	u32 temp;
	unsigned long uaddr, kaddr;
	int chan1, chan2 = -1;
	int current_channel;
	struct xor_channel_t *channel;

	DPRINTK("dma_copy: entering\n");

	/* both channels are needed; give the first back if the second is busy */
	chan1 = allocate_channel();
	if (chan1 != -1) {
		chan2 = allocate_channel();
		if (chan2 == -1)
			free_channel(&xor_channel[chan1]);
	}
	if ((chan1 == -1) || (chan2 == -1))
		goto exit_dma;

	current_channel = chan1;

	/*
	 * The unaligned parts are handled separately since the dst might be
	 * part of a cache line that is changed by another process -> we must
	 * not invalidate those cache lines and we can't flush them either,
	 * since another process (or the exception handler) might fetch the
	 * cache line before we copied it.
	 */

	/*
	 * Ok, start addr is not cache line-aligned, so we need to make it so.
	 */
	unaligned_to = (u32)to & 31;
	if (unaligned_to) {
		DPRINTK("Fixing up starting address %d bytes\n", 32 - unaligned_to);

		if (to_user)
			__copy_to_user(to, from, 32 - unaligned_to);
		else
			__copy_from_user(to, from, 32 - unaligned_to);

		temp = (u32)to + (32 - unaligned_to);
		to = (void *)temp;
		temp = (u32)from + (32 - unaligned_to);
		from = (void *)temp;

		/* it's ok, n is supposed to be greater than 32 bytes at this point */
		n -= (32 - unaligned_to);
	}

	/*
	 * Ok, we're aligned at the top, now let's check the end
	 * of the buffer and align that. After this we should have
	 * a block that is a multiple of cache line size.
	 */
	unaligned_to = ((u32)to + n) & 31;
	if (unaligned_to) {
		u32 tmp_to = (u32)to + (n - unaligned_to);
		u32 tmp_from = (u32)from + (n - unaligned_to);

		DPRINTK("Fixing ending alignment %d bytes\n", unaligned_to);

		if (to_user)
			__copy_to_user((void *)tmp_to, (void *)tmp_from, unaligned_to);
		else
			__copy_from_user((void *)tmp_to, (void *)tmp_from, unaligned_to);

		/* it's ok, n is supposed to be greater than 32 bytes at this point */
		n -= unaligned_to;
	}

	if (to_user) {
		uaddr = (unsigned long)to;
		kaddr = (unsigned long)from;
	} else {
		uaddr = (unsigned long)from;
		kaddr = (unsigned long)to;
	}

	/*
	 * A statically mapped kernel buffer is treated as one run of n bytes;
	 * otherwise the kernel side is split at page boundaries in the loop.
	 */
	if (virt_addr_valid(kaddr))
		k_chunk = n;

	spin_lock_irqsave(&current->mm->page_table_lock, flags);

	while (n > 0) {
		if (k_chunk == 0) {
			/* virtual address */
			k_chunk = page_remainder((u32)kaddr);
			DPRINTK("kaddr reminder %d \n", k_chunk);
		}

		if (u_chunk == 0) {
			u_chunk = page_remainder((u32)uaddr);
			DPRINTK("uaddr reminder %d \n", u_chunk);
		}

		/* largest piece that crosses neither page boundary nor n */
		chunk = ((u_chunk < k_chunk) ? u_chunk : k_chunk);
		if (n < chunk)
			chunk = n;

		if (chunk == 0)
			break;

		phys_from = physical_address((u32)from, 0);
		phys_to = physical_address((u32)to, 1);
		DPRINTK("choose chunk %d \n", chunk);

		/*
		 *  Prepare the IDMA.
		 */
		if (chunk < XOR_MIN_COPY_CHUNK) {
			int last_chan = chan1;

			DPRINTK(" chunk %d too small , use memcpy \n", chunk);

			if (current_channel == chan1)
				last_chan = chan2;

			/* the "to" address might cross cache line boundary, so part of the line */
			/* may be subject to DMA, so we need to wait for the last DMA engine to finish */
			if (index > 1)
				xor_waiton_eng(last_chan);

			if (to_user)
				__copy_to_user((void *)to, (void *)from, chunk);
			else
				__copy_from_user((void *)to, (void *)from, chunk);
		} else {
			if ((!phys_from) || (!phys_to)) {
				/* The requested page isn't available: let the CPU copy the rest */
				DPRINTK(" no physical address, fall back: from %p , to %p \n", from, to);
				goto unlock_dma;
			} else {
				/*
				 * Ensure that the cache is clean:
				 *      - from range must be cleaned
				 *      - to range must be invalidated
				 */
				mvOsCacheFlush(NULL, (void *)from, chunk);
				mvOsCacheInvalidate(NULL, (void *)to, chunk);

				/* the channel about to be reused must have finished */
				if (index > 1)
					xor_waiton_eng(current_channel);

				channel = &xor_channel[current_channel];

				/* Start DMA */
				DPRINTK(" activate DMA: channel %d from %x to %x len %x\n",
					current_channel, phys_from, phys_to, chunk);
				channel->pDescriptor->srcAdd0 = phys_from;
				channel->pDescriptor->phyDestAdd = phys_to;
				channel->pDescriptor->byteCnt = chunk;
				channel->pDescriptor->phyNextDescPtr = 0;
				channel->pDescriptor->status = BIT31;
				channel->chan_active = 1;

				if (mvXorTransfer(current_channel, MV_DMA, channel->descPhyAddr) != MV_OK) {
					printk(KERN_ERR "%s: DMA copy operation on channel %d failed!\n", __func__, current_channel);
					print_xor_regs(current_channel);
					BUG();
				}

				/* alternate between the two channels */
				if (current_channel == chan1)
					current_channel = chan2;
				else
					current_channel = chan1;
#ifdef RT_DEBUG
				dma_activations++;
#endif
				index++;
			}
		}

		/* go to next chunk */
		from += chunk;
		to += chunk;
		kaddr += chunk;
		uaddr += chunk;
		n -= chunk;
		u_chunk -= chunk;
		k_chunk -= chunk;
	}

unlock_dma:
	/* drain both channels before they are released */
	xor_waiton_eng(chan1);
	xor_waiton_eng(chan2);
	spin_unlock_irqrestore(&current->mm->page_table_lock, flags);
	free_channel(&xor_channel[chan1]);
	free_channel(&xor_channel[chan2]);

exit_dma:
	DPRINTK("dma_copy(0x%x, 0x%x, %lu): exiting\n", (u32) to,
		(u32) from, n);

	/* whatever the engines did not handle is copied by the CPU */
	if (n != 0) {
		if (to_user)
			return __copy_to_user((void *)to, (void *)from, n);
		else
			return __copy_from_user((void *)to, (void *)from, n);
	}
	return 0;
}
#endif

#ifdef CONFIG_MV_DMAMEMZERO
/*=======================================================================*/
/*  Procedure:  dma_memzero()                                             */
/*                                                                       */
/*  Description:    DMA-based in-kernel memzero.                          */
/*                                                                       */
/*  Parameters:  to: destination address                                 */
/*               n: number of bytes to transfer                          */
/*                                                                       */
/*                                                                       */
/*  Notes/Assumptions:                                                   */
/*              Assumes that kernel physical memory is contiguous, i.e., */
/*              the physical addresses of contiguous virtual addresses   */
/*              are also contiguous.                                     */
/*              Assumes that kernel memory doesn't get paged.            */
/*              The DMA is polling                                       */
/*                                                                       */
/*=======================================================================*/
/*
 * dma_memzero() - zero n bytes at 'to' using the engine's mem-init operation.
 *
 * The CPU (asm_memzero) does the whole job, counted as a miss, when the
 * engines are not initialized, 'to' is rejected by virt_addr_valid(), or
 * the buffer ends before the first cache-line boundary. Otherwise the
 * unaligned head and tail are zeroed by the CPU and the aligned middle is
 * handed to a channel; if no channel or the mem-init semaphore is
 * available, or nothing is left after trimming, the CPU finishes the job.
 */
void dma_memzero(void *to, __kernel_size_t n)
{
	u32 xor_dma_unaligned_to;
	int ua = 0;
	int chan;
	struct xor_channel_t *channel;

	DPRINTK("dma_memzero(0x%x, %lu): entering\n", (u32) to, (unsigned long)n);

	if (xor_engine_initialized == 0) {
		DPRINTK(KERN_WARNING" %s: xor engines not initialized yet\n", __func__);
		xor_memzero_miss++;
		return asm_memzero(to, n);
	}
	if (!(virt_addr_valid((u32) to))) {
		DPRINTK("dma_memcpy(0x%x, %lu): falling back to memzero\n",
			(u32) to, (unsigned long)n);
		xor_memzero_miss++;
		return asm_memzero(to, n);
	}

	/*
	 * We can only handle completely cache-aligned transactions
	 * with the DMA engine.  Dst must be cache-line
	 * aligned AND the length must be a multiple of the cache-line.
	 */

	/*
	 * Ok, start addr is not cache line-aligned, so we need to make it so.
	 */
	xor_dma_unaligned_to = (u32) to & 31;
	if (xor_dma_unaligned_to) {
		u32 head = 32 - xor_dma_unaligned_to;

		/*
		 * The buffer ends at or before the next line boundary: zeroing
		 * 'head' bytes would run past it and underflow n.
		 */
		if (n <= head) {
			xor_memzero_miss++;
			return asm_memzero(to, n);
		}
		ua++;
		asm_memzero(to, head);
		to = (void *)((u32)to + head);
		n -= head;
	}

	/*
	 * Ok, we're aligned at the top, now let's check the end
	 * of the buffer and align that. After this we should have
	 * a block that is a multiple of cache line size.
	 */
	xor_dma_unaligned_to = ((u32) to + n) & 31;
	if (xor_dma_unaligned_to) {
		u32 tmp_to = (u32) to + n - xor_dma_unaligned_to;

		asm_memzero((void *)tmp_to, xor_dma_unaligned_to);
		n -= xor_dma_unaligned_to;
		ua++;
	}

	/* head and tail covered everything: nothing left for the engine */
	if (n == 0) {
		xor_memzero_miss++;
		return;
	}

	/*
	 * OK! We should now be fully aligned on both ends.
	 */
	chan = allocate_channel();
	if (chan == -1) {
		DPRINTK("XOR engines are busy, return\n");
		xor_memzero_miss++;
		return asm_memzero(to, n);
	}
	if (down_trylock(&meminit_sema)) {
		DPRINTK("meminit is used by one of the XOR engines\n");
		xor_memzero_miss++;
		free_channel(&xor_channel[chan]);
		return asm_memzero(to, n);
	}

	DPRINTK("setting up rest of descriptor for channel %d\n", chan);
	channel = &xor_channel[chan];

	/* Ensure that the cache is clean */
	dmac_inv_range((unsigned long)to, (unsigned long)to + n);

	channel->chan_active = 1;

	DPRINTK("setting up rest of descriptor\n");
	if (mvXorMemInit(chan, virt_to_phys(to), n, 0, 0) != MV_OK) {
		/* n is a size type: print it as unsigned long, not with %d */
		printk(KERN_ERR "%s: DMA memzero operation on channel %d failed. to %p len %lu!\n",
		       __func__, chan, to, (unsigned long)n);
		/* nothing was started, so there is nothing to wait for later */
		channel->chan_active = 0;
		free_channel(channel);
		up(&meminit_sema);
		return asm_memzero(to, n);
	}
	/* the DMA is polling: spin until the engine is done */
	xor_waiton_eng(chan);

	DPRINTK("DMA memzero complete\n");
	up(&meminit_sema);
	free_channel(channel);
	xor_memzero_hit++;
	if (ua)
		xor_memzero_unaligned++;
}
EXPORT_SYMBOL(dma_memzero);
#endif

#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
/*
 * Interrupt handler shared by the XOR channels.
 *
 * Reads the cause register, acknowledges every bit that was set (by
 * writing back the complement), and wakes the waiter of each channel whose
 * BIT1 (channel 0) or BIT17 (channel 1) is set. Any cause bit outside the
 * ignored set is reported as an error. Always returns IRQ_HANDLED.
 */
static irqreturn_t
mv_xor_isr(int irq, void *dev_id, struct pt_regs *regs)
{
    MV_U32  reg;

    reg = MV_REG_READ(XOR_CAUSE_REG);
    MV_REG_WRITE(XOR_CAUSE_REG, ~reg);
    DPRINTK("%s: cause 0x%08x, dev_id %d\n", __func__, reg, (int)dev_id);

    if (reg & BIT1)
    {
        wake_up(&xor_channel[0].waitq);
    }
#if XOR_MAX_CHANNELS > 1
    /* xor_channel[1] only exists when two engines are configured */
    if (reg & BIT17)
    {
        wake_up(&xor_channel[1].waitq);
    }
#endif
    /*ignore access protection*/
    if (reg & ~(BIT20|BIT17|BIT16|BIT4|BIT1|BIT0))
    {
        printk("%s error: cause register 0x%08x\n", __func__, reg);
    }
    return IRQ_HANDLED;
}
#endif
/*
 * read_proc handler for /proc/mv_xor: formats the driver's statistics
 * counters into 'buf'.
 *
 * The incoming 'len' and 'offset' are not used; the whole report is always
 * written from the start of 'buf'. Returns the number of bytes written and
 * flags end-of-file, since the report is produced in a single call.
 */
static int xor_read_proc(char *buf, char **start, off_t offset, int len,
			 int *eof, void *data)
{
	len = 0;

	len += sprintf(buf + len, "Number of XOR hits: %d\n", xor_hit);
	len += sprintf(buf + len, "DMA memcopy hits: %d\n", xor_dma_hit);
	len += sprintf(buf + len, "DMA memcopy misses: %d\n", xor_dma_miss);
	len += sprintf(buf + len, "DMA memcopy unaligned buffers: %d\n", xor_dma_unaligned);
	len += sprintf(buf + len, "DMA memzero hits: %d\n", xor_memzero_hit);
	len += sprintf(buf + len, "DMA memzero misses: %d\n", xor_memzero_miss);
	/* report the unaligned counter here, not the hit counter again */
	len += sprintf(buf + len, "DMA memzero unaligned buffers: %d\n", xor_memzero_unaligned);
	len += sprintf(buf + len, "copy to/from user DMA min buffer %ld\n", mv_dma_min_buffer);
	len += sprintf(buf + len, "Number of DMA copy to user %d copy from user %d \n",
		       dma_to_user, dma_from_user);
#ifdef RT_DEBUG
	len += sprintf(buf + len, "copy to/from user dma activations %d\n", dma_activations);
#endif
	*eof = 1;
	return len;
}

/*
 * Driver initialization.
 *
 * Announces the enabled features, initializes the XOR unit, allocates one
 * coherent descriptor per channel, sets up every channel (and its IRQ in
 * interrupt mode), creates /proc/mv_xor and finally marks the engines as
 * usable through xor_engine_initialized.
 *
 * Returns 0 on success, -ENODEV on a device without XOR engines, -ENOMEM
 * when the descriptors cannot be allocated, and -ENXIO / -EBUSY when a
 * channel IRQ cannot be set up. On IRQ failure the IRQs already requested
 * and the descriptors are released again. A missing /proc entry is only
 * logged.
 */
int mv_xor_init(void)
{
    int chan;
#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
    int err = 0;
    int ret;
    char *mode = "offloading";
#else
    char *mode = "acceleration";
#endif

    printk(KERN_INFO "Use the XOR engines (%s) for enhancing the following functions:\n", mode);
#ifdef CONFIG_MV_RAID5_XOR_OFFLOAD
    printk(KERN_INFO "  o RAID 5 Xor calculation\n");
#endif
#ifdef CONFIG_MV_DMACOPY
    printk(KERN_INFO "  o kernel memcpy\n");
#endif
#ifdef CONFIG_MV_DMAMEMZERO
    printk(KERN_INFO "  o kernel memzero\n");
#endif
#ifdef CONFIG_MV_USE_XOR_FOR_COPY_USER_BUFFERS
    printk(KERN_INFO "  o copy user to/from kernel buffers\n");
#endif
    printk(KERN_INFO "Number of XOR engines to use: %d\n", XOR_MAX_CHANNELS);

    if (mvCtrlModelGet() == MV_5082_DEV_ID)
    {
        printk(KERN_WARNING " This device doesn't have XOR engines.\n");
        return -ENODEV;
    }
    mvXorInit();

    /* pre-alloc XOR descriptors */
    pDescriptors = dma_alloc_coherent(NULL, sizeof(MV_XOR_DESC) * XOR_MAX_CHANNELS,
                                      &descsPhyAddr, GFP_KERNEL);
    if (pDescriptors == NULL)
    {
        printk(KERN_ERR "%s: failed to allocate XOR descriptors\n", __func__);
        return -ENOMEM;
    }
    sema_init(&meminit_sema, 1);
    memset(pDescriptors, 0, sizeof(MV_XOR_DESC) * XOR_MAX_CHANNELS);
    DPRINTK(" allocating XOR Descriptors: virt add %p, phys addr %x\n",
            pDescriptors, descsPhyAddr);

    for (chan = 0; chan < XOR_MAX_CHANNELS; chan++)
    {
        xor_channel[chan].chan_num = chan;
        xor_channel[chan].pDescriptor = pDescriptors + chan;
        xor_channel[chan].descPhyAddr = descsPhyAddr + (sizeof(MV_XOR_DESC) * chan);
        xor_channel[chan].chan_active = 0;

        sema_init(&xor_channel[chan].sema, 1);
        init_waitqueue_head(&xor_channel[chan].waitq);
        mvXorCtrlSet(chan, (1 << XEXCR_REG_ACC_PROTECT_OFFS) |
                    (4 << XEXCR_DST_BURST_LIMIT_OFFS) |
                    (4 << XEXCR_SRC_BURST_LIMIT_OFFS));
#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
        switch (chan)
        {
            case 0:
                xor_channel[chan].irq_num = XOR0_IRQ_NUM;
                xor_channel[chan].name = "xor_chan0";
                break;
            case 1:
                xor_channel[chan].irq_num = XOR1_IRQ_NUM;
                xor_channel[chan].name = "xor_chan1";
                break;
            default:
                printk(KERN_ERR "%s: trying to configure bad xor channel\n", __func__);
                ret = -ENXIO;
                goto err_release;
        }
        err = request_irq(xor_channel[chan].irq_num, mv_xor_isr, SA_INTERRUPT,
                          xor_channel[chan].name, (void *)chan);
        if (err < 0)
        {
            /* report the IRQ that actually failed, not always channel 0's */
            printk(KERN_ERR "%s: unable to request IRQ %d for "
                            "XOR %d: %d\n", __func__, xor_channel[chan].irq_num, chan, err);
            ret = -EBUSY;
            goto err_release;
        }
        MV_REG_WRITE(XOR_MASK_REG,0xFFEFFFEF);
#endif
    }
#ifdef CONFIG_PROC_FS
    xor_read_proc_entry =
        create_proc_entry("mv_xor", S_IFREG | S_IRUGO, 0);
    if (xor_read_proc_entry != NULL)
    {
        xor_read_proc_entry->read_proc = xor_read_proc;
        xor_read_proc_entry->write_proc = NULL;
        xor_read_proc_entry->nlink = 1;
    }
    else
    {
        /* statistics are optional: keep the engines usable without them */
        printk(KERN_WARNING "%s: failed to create /proc/mv_xor\n", __func__);
    }
#endif
    xor_engine_initialized = 1;
    return 0;

#ifdef CONFIG_ENABLE_XOR_INTERRUPTS
err_release:
    /* channels below 'chan' already own their IRQ: give them back */
    while (--chan >= 0)
        free_irq(xor_channel[chan].irq_num, (void *)chan);
    dma_free_coherent(NULL, sizeof(MV_XOR_DESC) * XOR_MAX_CHANNELS,
                      pDescriptors, descsPhyAddr);
    pDescriptors = NULL;
    return ret;
#endif
}

void mv_xor_exit(void)
{
    printk(KERN_INFO "XOR acceleration exit\n");
    return;
}
module_init(mv_xor_init);
module_exit(mv_xor_exit);
MODULE_LICENSE(GPL);



