/*#define DOUBLE_CHECK_RESULT*/
/*#define DMA*/

#ifdef SIMONLY
#define REALHARD 0
#else
#define REALHARD 1
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>

#ifndef GCC
#include <c_asm.h>
#endif
#include "memmap.h"
#include "phibutil.h"
#include <sys/time.h>

#define DEFINE_IN_GRAPPE6UTIL 

#include "../grape6/g6chip_simlib.h"
#include "grape6util-local.h"
#include "grape6util.h"

#ifndef LINUX
#ifdef __linux__
#define LINUX 1
#endif
#endif
#define NCLUSTERS (4)
#define DEVID (3)
#define MAXNCHIPS (NCLUSTERS*MAXCHIPSPERCLUSTER)

#define MAXSIZE (0x8000)    /*max data size (=0x20000(=128k) by BYTE)*/
#define SETCOUNT (0x8000)   /*set counter value  (=0x20000 by BYTE)*/
#define GETCOUNT (0x10000)  /*get counter value (=0x40000 by BYTE)*/
#define LINK1BASE (0x10000)  /*output to link1 (=0x40000 by BYTE)*/
#define LINK2BASE (0x18000)  /*output to link2 (=0x60000 by BYTE)*/	     
#define DPRAMBASE (0x20000)  /*read from DPram (=0x80000 by BYTE)*/
#define WRITEBASE (0X00)    /*write address offset by word*/

#ifdef X86
#    define PREFER32BIT 1
#else
#    define PREFER32BIT 0
#endif

#define NBLOCKMAX 32
/* max number of particles to be sent by one DMA
   used to be 20 (before 2002/2/17), which caused too high
   error rate on ECS K7S6A operating at 133 MHz FSB
   set to 10 on 2002/2/17
   test to increase to 16 on 2002/7/24
   to 38 on 2002/7/25 */

/* unions for type conversion */
/* DATAPACK overlays three views of the same storage, so a value stored
   through one member can be read back through another. */
typedef union datapack{
    unsigned int i32[2]; /* view as two unsigned int words */
    ULONG        i64;    /* view as one ULONG (type declared outside this section; presumably 64-bit -- confirm) */
    double       d;      /* view as one double */
}DATAPACK;

/* SHORTPACK overlays an unsigned int and a float on the same storage. */
typedef union{
    unsigned int i32; /* raw word view */
    float f;          /* floating-point view */
}SHORTPACK;


/* JPWAITFACTOR and hib_jplinkid are not referenced in this part of the file. */
#define JPWAITFACTOR 25
/* Per-board state, indexed by board id. */
unsigned int *hib[NCLUSTERS];               /* pointer returned by TBopen(), stored by linkopen() */
unsigned int hib_ijp_state[NCLUSTERS];      /* AUX state last written by set_ijp_mode()/setlinkmode(): 0 or 1 */
unsigned int hib_sending_jp[NCLUSTERS];     /* requested mode; nonzero asks for AUX state 1 */
unsigned int hib_system_version[NCLUSTERS]; /* AUX switching is only performed when this is >= 200 */
unsigned int hib_jplinkid[NCLUSTERS];

/* Value installed by g6_setmccount_() and returned by g6_mccount_(). */
static int mccount = 1;

/* dummyloop() iterations per CPU second, measured by calibrate_dummyloop(). */
static int waitfactor;

/*
 * Return the user-mode CPU time consumed so far by this process, in
 * seconds (ru_utime as reported by getrusage(RUSAGE_SELF)).
 *
 * If getrusage() fails, a message is written to stderr and 0.0 is
 * returned. The usage record is cleared first so that a failure can never
 * lead to reading uninitialized memory.
 */
static double cpu_second()
{
    struct rusage usage;

    memset(&usage, 0, sizeof(usage));
    if(getrusage(RUSAGE_SELF,&usage)){
        fprintf(stderr,"getrusage failed\n");
        return 0.0;
    }
    return  usage.ru_utime.tv_sec + usage.ru_utime.tv_usec*1e-6;
}
/* Offset subtracted from the clock reading in wall_second(). No assignment
   is visible in this part of the file, so it starts out as 0.0. */
static double tstart;
/* Scratch arguments handed to gettimeofday() by wall_second(). */
static struct timeval timearg;
static struct timezone zonearg;
/*
 * Return the current wall-clock time in seconds (gettimeofday) minus the
 * file-level offset tstart. A failure of gettimeofday() is only reported
 * on stderr; the value is still computed from timearg.
 */
static double wall_second()
{
    int rc;

    rc = gettimeofday(&timearg,&zonearg);
    if (rc != 0) {
        fprintf(stderr,"Timer failed\n");
    }
    return  timearg.tv_sec + timearg.tv_usec*1e-6 - tstart;
}

/* Public wrapper: returns whatever cpu_second() reports. */
double g6_cpu_second()
{
    double seconds = cpu_second();
    return seconds;
}

/* Public wrapper: returns whatever wall_second() reports. */
double g6_wall_second()
{
    double seconds = wall_second();
    return seconds;
}

/*
 * Busy-wait primitive: call g6_dummywait() `count` times.
 * A zero or negative count performs no calls.
 */
void dummyloop(int count)
{
    int done = 0;

    while (done < count) {
        g6_dummywait();
        done++;
    }
}

/*
 * Measure how many dummyloop() iterations fit in one CPU second and store
 * the result in waitfactor, which uwait()/nwait() use to turn a time into
 * an iteration count. The measured value is also printed to stderr.
 */
void calibrate_dummyloop()
{
    double started;
    double elapsed;

    started = cpu_second();
    dummyloop(40000000);
    elapsed = cpu_second() - started;
    /* iterations divided by CPU seconds spent on them */
    waitfactor = 40000000/elapsed;
    fprintf(stderr," Dummy loop wait factor = %d\n", waitfactor);
}

/*
 * Busy-wait for roughly `waitusec` microseconds, using the iteration rate
 * stored in waitfactor (see calibrate_dummyloop).
 */
void uwait(int waitusec)
{
    int iterations = (int)(waitusec*1e-6*waitfactor);

    dummyloop(iterations);
}
/*
 * Busy-wait using the iteration rate stored in waitfactor.
 * Note: despite the parameter name, the argument is scaled by 1e-9, i.e.
 * it is treated as a number of nanoseconds.
 */
void nwait(int waitusec)
{
    int iterations = (int)(waitusec*1e-9*waitfactor);

    dummyloop(iterations);
}


/*
 * Classify the running Linux kernel from the first token that follows
 * "Linux version " in /proc/version.
 *
 * Returns:
 *    0 : 2.2.x
 *    1 : 2.4.x
 *    2 : 2.6.x
 *    3 : major version 2 with any other minor digit (reserved)
 *   -1 : non-Linux build (LINUX not defined), /proc/version could not be
 *        opened or parsed, or the major version is not 2
 *
 * Progress and diagnostic messages are written to stderr.
 */
int getlinuxversiontype()
{
#ifndef LINUX
    return -1;
#else
    FILE* infile;
    char versionstring[100];
    int nfields;

    /* Zero-fill so that the character tests below never look at
       indeterminate bytes when the version token is very short. */
    memset(versionstring, 0, sizeof(versionstring));

    fprintf(stderr,"open /proc/version\n");
    infile = fopen("/proc/version","r");
    if (infile == NULL){
        fprintf(stderr,"Failed to open /proc/version\n");
        return -1;
    }
    /* The field width keeps the token inside versionstring. */
    nfields = fscanf(infile,"Linux version %99s", versionstring);
    fclose(infile);
    if (nfields != 1){
        fprintf(stderr,"Failed to parse /proc/version\n");
        return -1;
    }

    fprintf(stderr,"Linux version = %s\n", versionstring);
    if(versionstring[0] != '2'){
        fprintf(stderr,"Unkown major version for verion %s return -1\n ", versionstring);
        return -1;
    }
    /* versionstring[2] is the minor-version digit of "2.x..." */
    if(versionstring[2] == '2'){
        fprintf(stderr,"Linux verion 2.2.x\n ");
        return 0;
    }
    if(versionstring[2] == '4'){
        fprintf(stderr,"Linux verion 2.4.x\n ");
        return 1;
    }
    if(versionstring[2] == '6'){
        fprintf(stderr,"Linux verion 2.6.x\n ");
        return 2;
    }
    fprintf(stderr,"Unknown minor version\n ");
    return 3;
#endif
}

/* Pointer-argument variant: store *newval as the file-level mccount. */
void g6_setmccount_(int * newval)
{
    int requested = *newval;

    mccount = requested;
}
/* By-value variant of g6_setmccount_(). */
void g6_setmccount(int newval)
{
    int * argp = &newval;

    g6_setmccount_(argp);
}

/* Return the current file-level mccount value. */
int g6_mccount_()
{
    int current = mccount;

    return current;
}

void PCIdummyaccess(int boardid);


/*
 * Write the "slow" DMA mode word 0x0c3 to register 0x80 of the device and
 * follow it with a dummy PCI access.
 *
 * Notes kept from the original author about the mode word:
 *   - local burst disabled, loc8 = 0
 *   - 1 Wait, loc2 = 1
 *   - bit 7: BTERM (if 0, local burst = 4 Lwords)
 */
void setslowhibdmamode(unsigned int devid)
{
    int mode_word = 0x0c3;

    TBregWrite(devid, 0x80, mode_word);
    PCIdummyaccess(devid);
}
/*
 * Write the DMA mode word 0x1c3 to register 0x80 of the device and follow
 * it with a dummy PCI access. It differs from setslowhibdmamode() only in
 * the mode word (0x1c3 instead of 0x0c3).
 */
void sethibdmamode(unsigned int devid)
{
    int mode_word = 0x1c3;

    TBregWrite(devid, 0x80, mode_word);
    PCIdummyaccess(devid);
}

/*
 * Program the DMA registers of device `devid` for a local --> PCI transfer
 * of `size` 4-byte words to PCI address `pcioff`, then start the DMA.
 * The function returns immediately; it does not wait for completion.
 */
void g6hib_dmaw(unsigned int devid,unsigned long pcioff, unsigned int size)
{
    TBregWrite(devid, 0x84, pcioff); /* PCI-side address */
    /*#define     DMA_BROKEN_FIRSTWORD*/
#ifdef      DMA_BROKEN_FIRSTWORD
    TBregWrite(devid, 0x88, 0x80004);/* start address ... sometimes 80004 seems
					necessary ... why? */
#else
    TBregWrite(devid, 0x88, 0x80000); /* local start address */
#endif
    TBregWrite(devid, 0x8c, size*4); /* size in byte */
    TBregWrite(devid, 0x90, 0x00000008); /* direction (8:local --> PCI) */
    TBregWrite(devid, 0xa8, 0x00000003); /* start DMA */
}

/*
 * Same as g6hib_dmaw(), but the local start address is 0x80000 plus
 * `localoff` 4-byte words. Starts the DMA and returns without waiting.
 */
void g6hib_dmaw_with_localoffset(unsigned int devid,
				 unsigned long pcioff,
				 unsigned int localoff,
				 unsigned int size)
{
    TBregWrite(devid, 0x84, pcioff); /* PCI-side address */
    TBregWrite(devid, 0x88, 0x80000+localoff*4); /* local start address, offset given in words */
    TBregWrite(devid, 0x8c, size*4); /* size in byte */
    TBregWrite(devid, 0x90, 0x00000008); /* direction (8:local --> PCI) */
    TBregWrite(devid, 0xa8, 0x00000003); /* start DMA */
}


/*
 * Program the DMA registers of device `devid` for a PCI --> local transfer
 * of `size` 4-byte words from PCI address `pcioff` to local address
 * `localoff` (written to the register unscaled), then start the DMA.
 * Returns without waiting for completion.
 */
void g6hib_dmar(unsigned int devid,
		unsigned long pcioff,
		unsigned long localoff,
		unsigned int size)
{
    TBregWrite(devid, 0x84, pcioff); /* PCI-side address */
    TBregWrite(devid, 0x88, localoff); /* local-side address */
    TBregWrite(devid, 0x8c, size*4); /* size in byte */
    TBregWrite(devid, 0x90, 0x00000000); /* direction (0:PCI-->local) */
    TBregWrite(devid, 0xa8, 0x00000003); /* start DMA */
}




int g6hib_dma_status(unsigned int devid);

/*
 * Poll g6hib_dma_status() until it returns zero, waiting about 3
 * microseconds between polls. Compiles to a no-op when REALHARD is 0.
 */
void wait_dma_to_finish(unsigned int boardid)
{
#if REALHARD
    for (;;) {
        if (!g6hib_dma_status(boardid)) {
            break;
        }
        uwait(3);
    }
#endif
}

#if REALHARD
/*
 * Open board `boardid` through TBopen() and remember the returned pointer
 * in hib[boardid].
 * Returns 0 on success, -1 when TBopen() returned a null value.
 */
int linkopen(int boardid, int waittime)
{
    unsigned int * mapped;

    mapped = (unsigned int*)TBopen(boardid,waittime);
    hib[boardid] = mapped;
    return (((long int)mapped) == 0) ? -1 : 0;
}
#else
/* Simulation-only build: nothing to open, always succeeds. */
int linkopen(int boardid, int waittime)
{
    return 0;
}
#endif

/* Close board `boardid` by calling TBterm(). Unlike linkopen(), this is
   not conditional on REALHARD. */
void linkclose(int boardid)
{
    TBterm(boardid);
}

/*
 * Reset HIB `boardid` by stepping the top byte of register 0x6c through
 * 0xd8 -> 0xb8 -> 0x98 while preserving the low 24 bits, then sleeping
 * 90 ms. After each of the first two writes the register is polled until
 * the top byte reads back as written. Compiles to a no-op when REALHARD
 * is 0.
 */
void reset_board(int boardid)
{
#if REALHARD
       unsigned int tempadr;
       unsigned int tempdata;
       unsigned int writedata;
       unsigned int mask;
       unsigned int i;
       
       tempadr = 0x6c;

#ifdef INTERNAL_OUT0
       fprintf(stderr,"g6utils -- resetting HIB %d\n", boardid);
#endif
       /*       linkopen(boardid);*/
       mask = 0x00ffffff; /* keeps the low 24 bits of the register */
       tempdata = (TBregRead(boardid,tempadr) & mask);
       writedata = (0xd8000000|tempdata);
       TBregWrite(boardid,tempadr,writedata);
       /* poll until the top byte reads back as 0xd8; every failed poll is logged */
       while((TBregRead(boardid,tempadr)&0xff000000)!=0xd8000000){
	   fprintf(stderr,"g6reset adr, retval = %x %x\n",
		   tempadr, TBregRead(boardid,tempadr));
       }
       /* NOTE(review): empty counting loop, presumably meant as a short
	  delay; an optimizing compiler may remove it */
       for (i=0; i<10000; i++){}
       tempdata = (TBregRead(boardid,tempadr) & mask);
       writedata = (0xb8000000|tempdata);
       TBregWrite(boardid,tempadr,writedata);

       /* poll (silently) until the top byte reads back as 0xb8 */
       while((TBregRead(boardid,tempadr)&0xff000000)!=0xb8000000){}
       for (i=0; i<10000; i++){}
       tempdata = (TBregRead(boardid,tempadr) & mask);
       writedata = (0x98000000|tempdata);
       TBregWrite(boardid,tempadr,writedata);
       /* the final write is not polled; just wait 90 ms */
       usleep(90000);
#endif
       
}


void g6set_aux(unsigned int boardid,
	       unsigned int auxval0,
	       unsigned int auxval1);


#ifndef X86
void linkwrite_sub_4words(unsigned int * source, unsigned int * destination, int size)
{
    int i;
    int size4 = size >> 3;
    register unsigned long * pp = (unsigned long *)source;
    register unsigned long * qq = (unsigned long *) destination;
    register unsigned int * p;
    register unsigned int * q;
    /*    fprintf(stderr,"sub_4words, size = %d\n", size);*/
    for(i=0;i<size4;i++, pp += 4, qq += 4){
	register unsigned long v0,v1,v2,v3;
	    v0= *pp;
	    v1= *(pp+1);
	    v2 = *(pp+2);
	    v3 = *(pp+3);
	    *qq=      v0;
	    *(qq+1) = v1;
	    *(qq+2) = v2;
	    *(qq+3) = v3;
    }
    /*
     * Note (2000/9/27)
     * Use of long here and hand unrolling give some performance improvement
     * on EV6 (UP1000/600 MHz). Simple loop (the below form) gives 56MB/s for G6HIB,
     * while the above form gives around 72MB/s.
     *
     * Assuming that the burst length is limited to 8, these numbers corresponds to
     * total cycle count of 19 and 15, respectively.
     */
    size4 *= 8;
    if (size4 != size){
	p=source+size4;
	q=destination+size4;
	for(i=size4 ;i<size;i++, p++, q++){
	    *q = *p;
	}
    }
}
#else /* End of non-x86 code */

void linkwrite_sub_4words(unsigned int * source, unsigned int * destination, int size)
{
    int i;
    int size4 = size >> 2;
    register unsigned int * p;
    register unsigned int * q;
    /*    fprintf(stderr,"sub_4words, size = %d, %d\n", size, sizeof(long));*/
    p = source;
    q = destination;  
    for(i=0 ;i<size;i++, p++, q++){
	*q = *p;
    }
}
#endif /* X86 */
/*
 * Bring the AUX state of board `boardid` in line with hib_sending_jp.
 * Only boards with system version >= 200 are touched, and g6set_aux() is
 * called only when the recorded state (0 or 1) differs from the requested
 * one.
 */
void set_ijp_mode(int boardid)
{
    int want_jp;

    if (hib_system_version[boardid] < 200) {
        return;
    }
    /* system version 2... check and change the AUX status */
    want_jp = (hib_sending_jp[boardid] != 0);
    if (want_jp && (hib_ijp_state[boardid] == 0)) {
        g6set_aux(boardid, 1, 1);
        hib_ijp_state[boardid] = 1;
    } else if (!want_jp && (hib_ijp_state[boardid] == 1)) {
        g6set_aux(boardid, 1, 0);
        hib_ijp_state[boardid] = 0;
    }
}

/*
 * Before writing to link 1 or 2 on a version >= 200 board, make the AUX
 * state match hib_sending_jp. When the state had to be changed, a dummy
 * PCI access and a 1 microsecond wait follow. Nothing happens for
 * link <= 0 or for older boards.
 */
void setlinkmode(int boardid,int link)
{
    int switched = 0;

    if ((hib_system_version[boardid] >= 200) && (link > 0)) {
        /* system version 2... check and change the AUX status */
        unsigned int wanted = (hib_sending_jp[boardid] != 0) ? 1 : 0;

        /* act only on a 0 -> 1 or 1 -> 0 transition of the recorded state */
        if (hib_ijp_state[boardid] == (1 - wanted)) {
            g6set_aux(boardid, 1, wanted);
            hib_ijp_state[boardid] = wanted;
            switched = 1;
#ifdef INTERNAL_OUT
            fprintf(stderr,"linkwrite, board %d link %d sending_jp=%d ijp_state=%d\n",
                    boardid, link, hib_sending_jp[boardid],
                    hib_ijp_state[boardid]);
#endif
        }
    }
    if (!switched) {
        return;
    }
    PCIdummyaccess(boardid);
    uwait(1);
}
    

#if REALHARD
/*
 * Write `size` words from buf to board `boardid`.
 *   boardid: index of G6HIB (if there are multiple boards), starts from 0
 *   link:    (a bad name) 0: internal registers, 1,2: links 1, 2
 * The destination is hib[boardid] + setbaseadr(link) + WRITEBASE and the
 * copy is done by linkwrite_sub_4words(). Always returns 0.
 */
int linkwrite(int boardid,int link,int size, unsigned int * buf)
{
    unsigned int * target;

    setlinkmode(boardid, link);
    target = hib[boardid] + (setbaseadr(link) + WRITEBASE);
    linkwrite_sub_4words(buf, target, size);
    return 0;
}

/*
 * Same as linkwrite(), but the copy is done with memcpy().
 * Only defined when REALHARD is nonzero. Always returns 0.
 */
int linkwrite_memcpy(int boardid,int link,int size, unsigned int * buf)
{
    unsigned int * target;

    setlinkmode(boardid, link);
    target = hib[boardid] + (setbaseadr(link) + WRITEBASE);
    memcpy(target, buf, sizeof(int)*size);
    return 0;
}
#else
/* Simulation-only build: nothing is written, always returns 0. */
int linkwrite(int boardid,int link,int size, unsigned int * buf)
{
    return 0;
}
#endif

/* Send the single word `data` with linkwrite(), then execute MB. */
void linkwrite_oneword(int boardid,int link,unsigned int data)
{
    unsigned int * wordp = &data;

    linkwrite(boardid, link, 1, wordp);
    MB;
}

/*
 * Send the three-word packet { address, 1, data } in a single linkwrite()
 * call, then execute MB.
 */
void linkwrite_onedata(int boardid,
		       int link,
		       unsigned int address,
		       unsigned int data)
{
    unsigned int packet[3];

    packet[2] = data;
    packet[1] = 1;
    packet[0] = address;
#ifdef INTERNAL_OUTX
    fprintf(stderr,"linkwrite_onedata %x %x\n", address, data);
#endif
    linkwrite(boardid, link, 3, packet);
    MB;
}

/*
 * Print the first `size` words found at word offset DPRAMBASE of the
 * board's mapped area to stderr, one "dpram[i]=value" line per word.
 */
void dpramdump(int boardid,int size)
{
    int idx;
    int * dpram;

    fprintf(stderr,"dpramdump called... %d\n", size);

    dpram = ((int *)hib[boardid]) + DPRAMBASE;
    for (idx = 0; idx < size; idx++) {
        fprintf(stderr,"dpram[%d]=%x\n", idx, dpram[idx]);
    }
}

#ifndef  X86
/*
 * Copy data from the DPRAM window of board `boardid` into buf, starting
 * `offset` words into the window and wrapping around at MAXSIZE words.
 *
 * Non-X86 variant: data is moved as unsigned longs, 16 per pass (blen).
 * offset, size, DPRAMBASE and MAXSIZE are all halved to convert from int
 * words to longs.
 * NOTE(review): this assumes an unsigned long holds exactly two unsigned
 * ints -- confirm for the target platform.
 * Every pass copies a full 16 longs, so the amount transferred is rounded
 * up to a multiple of 16 longs; buf must have room for that.
 * The wrap-around is applied only between passes, not inside one.
 * Always returns 0.
 */
int linkread_with_offset(int boardid,
			 int offset,
			 int size,
			 unsigned int *buf)
{
    register unsigned long v0,v1,v2,v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
    int i;
    int adrbase,lsize;
    register unsigned long * memp;
    register unsigned long * memp0;
    register unsigned long * bufp;
    int memoffset = offset>>1;
    int memoffset_mask = (MAXSIZE >>1)-1;
    unsigned int errcode = 0;
    adrbase = (DPRAMBASE)>>1;
    memp0 = ((unsigned long *)hib[boardid])+ adrbase;
    lsize=size>>1;
    bufp = (unsigned long *) buf;

#define blen 16
    for(i=0;i<lsize;i+=blen,  bufp += blen){
	memp = memp0 + memoffset;
	/* all 16 loads from the board are issued before any store to buf */
	{
	    v0 = *memp;
	    v1 = *(memp+1);
	    v2 = *(memp+2);
	    v3 = *(memp+3);
	    v4 = *(memp+4);
	    v5 = *(memp+5);
	    v6 = *(memp+6);
	    v7 = *(memp+7);
	    v8 = *(memp+8);
	    v9 = *(memp+9);
	    v10 = *(memp+10);
	    v11 = *(memp+11);
	    v12 = *(memp+12);
	    v13 = *(memp+13);
	    v14 = *(memp+14);
	    v15 = *(memp+15);
	}
	/* advance the source position, wrapping inside the window */
	memoffset = (memoffset + blen )& memoffset_mask;
	{
	    *bufp = v0;
	    *(bufp+1) = v1;
	    *(bufp+2) = v2;
	    *(bufp+3) = v3;
	    *(bufp+4) = v4;
	    *(bufp+5) = v5;
	    *(bufp+6)= v6;
	    *(bufp+7)= v7;
	    *(bufp+8) = v8;
	    *(bufp+9) = v9;
	    *(bufp+10) = v10;
	    *(bufp+11) = v11;
	    *(bufp+12) = v12;
	    *(bufp+13)= v13;
	    *(bufp+14)= v14;
	    *(bufp+15)= v15;
	}
    }
#ifdef INTERNAL_OUT0    
    for(i = 0; i<size; i++){
	fprintf(stderr,"(linkread %3d %8x\n", i, buf[i]);
    }
#endif    
    return(errcode);
}
#else
/*
 * X86 variant of the same copy: 4 unsigned longs per pass (blen), with
 * offset, size, DPRAMBASE and MAXSIZE used unscaled.
 * NOTE(review): this assumes an unsigned long has the same size as an
 * unsigned int -- confirm for the target platform.
 * Every pass copies a full 4 longs, so the amount transferred is rounded
 * up to a multiple of 4. Always returns 0.
 */
int linkread_with_offset(int boardid,
			 int offset,
			 int size,
			 unsigned int *buf)
{
    register unsigned long v0,v1,v2,v3;
    int i;
    int adrbase,lsize;
    register unsigned long * memp;
    register unsigned long * memp0;
    register unsigned long * bufp;
    int memoffset = offset;
    int memoffset_mask = (MAXSIZE )-1;
    unsigned int errcode = 0;
    adrbase = (DPRAMBASE);
    memp0 = ((unsigned long *)hib[boardid])+ adrbase;
    lsize=size;
    bufp = (unsigned long *) buf;

#define blen 4
    for(i=0;i<lsize;i+=blen,  bufp += blen){
	memp = memp0 + memoffset;
	/* the four loads are issued before the four stores */
	{
	    v0 = *memp;
	    v1 = *(memp+1);
	    v2 = *(memp+2);
	    v3 = *(memp+3);
	}
	/* advance the source position, wrapping inside the window */
	memoffset = (memoffset + blen )& memoffset_mask;
	{
	    *bufp = v0;
	    *(bufp+1) = v1;
	    *(bufp+2) = v2;
	    *(bufp+3) = v3;
	}
    }
#ifdef INTERNAL_OUT0    
    for(i = 0; i<size; i++){
	fprintf(stderr,"(linkread %3d %8x\n", i, buf[i]);
    }
#endif    
    return(errcode);
}
#endif

/*
 * Non-unrolled variant of linkread_with_offset(): copy data from the DPRAM
 * window of board `boardid` into buf one unsigned long at a time, starting
 * `offset` words into the window and wrapping around at MAXSIZE words.
 *
 * offset, size, DPRAMBASE and MAXSIZE are halved to convert from int words
 * to longs.
 * NOTE(review): this assumes an unsigned long holds exactly two unsigned
 * ints -- confirm for the target platform.
 *
 * The source position advances by one long per pass, matching the single
 * long that is read. It used to advance by the unroll-factor macro left
 * over from linkread_with_offset(), so all but every blen-th long of the
 * window was skipped.
 *
 * Always returns 0.
 */
int linkread_with_offset_slow(int boardid,
			 int offset,
			 int size,
			 unsigned int *buf)
{
    register unsigned long word;
    int i;
    int adrbase,lsize;
    register unsigned long * memp0;
    register unsigned long * bufp;
    int memoffset = offset>>1;
    int memoffset_mask = (MAXSIZE >>1)-1;
    unsigned int errcode = 0;

    adrbase = (DPRAMBASE)>>1;
    memp0 = ((unsigned long *)hib[boardid])+ adrbase;
    lsize=size>>1;
    bufp = (unsigned long *) buf;

    for(i=0;i<lsize;i+=1,  bufp += 1){
        word = *(memp0 + memoffset);
        /* one long consumed: step by one, wrapping inside the window */
        memoffset = (memoffset + 1)& memoffset_mask;
        *bufp = word;
    }
#ifdef INTERNAL_OUT0
    for(i = 0; i<size; i++){
        fprintf(stderr,"(linkread %3d %8x\n", i, buf[i]);
    }
#endif
    return(errcode);
}



/* Read `size` words from the start of the DPRAM window (offset 0). */
int linkread(int boardid,
	     int size,
	     unsigned int *buf)
{
    int status = linkread_with_offset(boardid, 0, size, buf);

    return status;
}




/*
 * Dummy PCI access: on X86 builds write 0 to SETCOUNT of the board, then
 * (on every build) execute MB once.
 */
void PCIdummyaccess(int boardid)
{
#ifdef X86
    TBmemWrite(boardid,SETCOUNT,0x0);
#endif
    MB;
}


/*
 * Clear the HIB counter of board `boardid`.
 *
 * First warns on stderr if bits 0x60000 are set in the GETCOUNT word, then
 * writes 0 to SETCOUNT twice (each write followed by MB), and finally
 * checks that GETCOUNT reads back as 0.
 * Returns 0 on success, 0x16 if the counter did not read back as zero.
 */
int counterclear(int boardid)
{
    int status = 0;
    int pass = 0;

    if ((TBmemRead(boardid,GETCOUNT)  & 0x60000) != 0) {
        fprintf(stderr,"Warning: HIB FIFO write error %x\n", TBmemRead(boardid,GETCOUNT));
    }

    while (pass < 2) {
        TBmemWrite(boardid,SETCOUNT,0x0);
        MB;
        pass++;
    }

    if (TBmemRead(boardid,GETCOUNT) != 0) {
        status = 0x16;
        fprintf(stderr,"Counter clear failed! %x\n", TBmemRead(boardid,GETCOUNT));
    }
    return status;
}
 
/*
 * Return the 15-bit counter field of the word at 0x10000 of the board
 * (bits 2..16 of that word, shifted down).
 */
int g6hib_getcounter(int boardid)
{
    unsigned int field = (TBmemRead(boardid,0x10000)>>2) & 0x7fff;

    return (int) field;
}

/*
 * Check the status word at 0x30000 of the board.
 *
 * The word is split into an address field (bits 2..21) and a 4-bit flag
 * field (bits 28..31). If the flags are nonzero and the address is below
 * ni*14, the address is logged and the flag value is returned (0 instead
 * when built with TESTMAIN); otherwise 0 is returned.
 *
 * Original author's caveat: this function is a quick hack and does not
 * work correctly with the neighbor list.
 */
int g6hib_foerror(int boardid, int ni)
{
    unsigned int status = TBmemRead(boardid,0x30000);
    unsigned int address = (status >> 2) & 0xfffff;
    unsigned int flags = status >> 28;

    if (!((address < (ni*14)) && (flags != 0))) {
        return 0;
    }
    fprintf(stderr,"g6hib_foerror: error address=%x\n",address);
#ifdef TESTMAIN
    return 0;
#endif
    return (int) flags;
}
/*
 * Repeat until the top four bits of the status word at 0x30000 are zero:
 * read address 0x00000 (result unused), execute MB, read the status word.
 * Each time the flags are still set, a message is logged and the board is
 * reset with g6reset() before trying again. Loops forever if the flags
 * never clear.
 */
void g6hib_foerror_clear(int boardid)
{
    unsigned int status;
    int flags;

    for (;;) {
        status = TBmemRead(boardid,0x00000);
        MB;
        status = TBmemRead(boardid,0x30000);
        flags = status >> 28;
        if (flags == 0) {
            break;
        }
        fprintf(stderr,"foerror_clear, failed... retrying %x %x\n",status,flags);
        g6reset(boardid); /* this line added on 2000/7/9 */
    }
}

/* Print the value of g6hib_getcounter(board) in hex to stderr. */
void g6hib_printcounter(int board)
{
    int counter = g6hib_getcounter(board);

    fprintf(stderr,"G6HIB counter = %x\n", counter);
}



/*
 * Validate the arguments of a write: board id, link, size, and a counter
 * clear. Returns the OR of the failed checks' bits
 * (0x1 id, 0x2 link, 0x4 size, 0x8 counter); 0 means all passed.
 */
int varcheckw(int boardid,int link,int size)
{
    int flags = idcheck(boardid, 0);

    flags = linkcheck(link, flags);
    flags = sizecheck(size, flags);
    flags = countercheck(boardid, flags);
    return flags;
}

/*
 * Validate the arguments of a read: board id, size, and a counter clear.
 * Returns the OR of the failed checks' bits
 * (0x1 id, 0x4 size, 0x8 counter); 0 means all passed.
 */
int varcheckr(int boardid,int size)
{
    int flags = idcheck(boardid, 0);

    flags = sizecheck(size, flags);
    flags = countercheck(boardid, flags);
    return flags;
}

/* Return errcode with bit 0x1 set when boardid lies outside 0..DEVID. */
int idcheck(int boardid,int errcode)
{
    int out_of_range = (boardid < 0) || (boardid > DEVID);

    return out_of_range ? (errcode | 0x1) : errcode;
}

/* Return errcode with bit 0x2 set when link is neither 1 nor 2. */
int linkcheck(int link,int errcode)
{
    int valid = (link >= 1) && (link <= 2);

    return valid ? errcode : (errcode | 0x2);
}

/* Return errcode with bit 0x4 set when size exceeds MAXSIZE. */
int sizecheck(int size,int errcode)
{
    return (size > MAXSIZE) ? (errcode | 0x4) : errcode;
}

/* Clear the board counter; return errcode with bit 0x8 set if that failed. */
int countercheck(int boardid,int errcode)
{
    int clear_status = counterclear(boardid);

    return (clear_status != 0) ? (errcode | 0x8) : errcode;
}

/*
 * Map a link number to its base word address:
 * 1 -> LINK1BASE, 2 -> LINK2BASE, anything else -> 0.
 */
int setbaseadr(int link)
{
    switch (link) {
    case 1:
        return LINK1BASE;
    case 2:
        return LINK2BASE;
    default:
        return 0;
    }
}



/* Table of cluster descriptors, indexed by cluster id; every slot starts
   out NULL. The initializer lists 32 entries (G6MAXCLUSTERS is defined
   outside this section; presumably 32 -- confirm). */
GRAPE6_CLUSTER_PTR  clusters[G6MAXCLUSTERS] =
{NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};

/* The "#if 0" branch below is disabled; only the "New style" declarations
   after its #else are compiled. */
#if 0
/* Old style: assign static storage if applicable */
#ifdef DMA
#define NEWPHIB
#ifndef NEWPHIB
static unsigned int dma_real_work_area[NCLUSTERS][DMA_MAX_SIZE*3];
static unsigned int * dma_work_area[NCLUSTERS];
#else
static unsigned int * dma_work_area[NCLUSTERS];
#endif
#else
static unsigned int dma_work_area[NCLUSTERS][DMA_MAX_SIZE*2];
#endif
#else
/* New style: always use malloc or ioctl to allocate memory */
/* Per-cluster DMA buffer pointers; the linkread_dma and linkwrite_dma
   routines copy through these. No assignment to them is visible in this
   part of the file. */
static unsigned int * dma_work_area[NCLUSTERS];
static unsigned int dma_real_work_area[NCLUSTERS][DMA_MAX_SIZE*3];
#endif

/* Not referenced in this part of the file. */
static unsigned int fo_work_area[NCLUSTERS][DMA_MAX_SIZE];

/*
 * Read `size` words from the cluster's FO board by DMA.
 *
 * Starts a local --> PCI DMA into the cluster's dma_offset, waits about
 * size/33 microseconds, polls g6hib_dma_status() every 2 microseconds
 * until it reports zero, then copies the words from
 * dma_work_area[clusterid] to buf. Always returns 0.
 */
int linkread_dma(int clusterid,
		 int size,
		 unsigned int *buf)
{
    int k;
    unsigned int * area;

    g6hib_dmaw(clusters[clusterid]->foboardid,clusters[clusterid]->dma_offset,
               size);
    uwait((size)/33);
    for (;;) {
        if (!g6hib_dma_status(clusters[clusterid]->foboardid)) {
            break;
        }
        uwait(2);
    }
    area = dma_work_area[clusterid];
    for (k = 0; k < size; k++) {
        buf[k] = area[k];
    }
    return 0;
}

/*
 * Same as linkread_dma(), but the DMA starts `localoffset` words into the
 * board's local memory (see g6hib_dmaw_with_localoffset). The data still
 * lands at the start of dma_work_area[clusterid]. Always returns 0.
 */
int linkread_dma_with_localoffset(int clusterid,
				  int size,
				  int localoffset,
				  unsigned int *buf)
{
    int k;
    unsigned int * area;

    g6hib_dmaw_with_localoffset(clusters[clusterid]->foboardid,
                                clusters[clusterid]->dma_offset,
                                localoffset,size);
    uwait((size)/33);
    for (;;) {
        if (!g6hib_dma_status(clusters[clusterid]->foboardid)) {
            break;
        }
        uwait(2);
    }
    area = dma_work_area[clusterid];
    for (k = 0; k < size; k++) {
        buf[k] = area[k];
    }
    return 0;
}

/*
 * Experimental DMA write that alternates between two halves of the DMA
 * buffer (word offsets 0 and 256) on successive calls.
 *
 *   clusterid: cluster index (unlike linkwrite, which takes the board id);
 *              starts from 0
 *   link:      (a bad name) 0: internal registers, 1,2: links 1, 2
 *
 * Waits for the previous DMA to finish, copies buf into the current half
 * of dma_work_area[clusterid] twice (ascending, then descending order),
 * sets the link mode and starts a PCI --> local DMA. Dummy PCI accesses
 * are placed between the steps. Does not wait for the new DMA to finish.
 * Always returns 0.
 */
int linkwrite_dma_test256(int clusterid,int link,int size, unsigned int * buf)
{
    static int dmabufoff = 0;   /* word offset of the half used by this call */
    int k;
    int board;
    int base;
    unsigned int * half;

    board = clusters[clusterid]->ipboardid;
    PCIdummyaccess(board);
    wait_dma_to_finish(board);
    PCIdummyaccess(board);
    base = setbaseadr(link);

    half = dma_work_area[clusterid] + dmabufoff;
    for (k = 0; k < size; k++) {
        half[k] = buf[k];
    }
    for (k = size-1; k >= 0; k--) {
        half[k] = buf[k];
    }

    PCIdummyaccess(board);
    setlinkmode(board, link);
    PCIdummyaccess(board);
    g6hib_dmar(board,
               clusters[clusterid]->dma_offset+(dmabufoff*sizeof(int)),
               base*sizeof(int), size);
    /* switch to the other half for the next call */
    dmabufoff = (dmabufoff+256) % 512;
    PCIdummyaccess(board);
    return 0;
}
/*
 * Write `size` words from buf to the cluster's IP board by DMA.
 *
 *   clusterid: cluster index (unlike linkwrite, which takes the board id);
 *              starts from 0
 *   link:      (a bad name) 0: internal registers, 1,2: links 1, 2
 *
 * Waits for the previous DMA to finish, copies buf into
 * dma_work_area[clusterid], sets the link mode and starts a PCI --> local
 * DMA to word address setbaseadr(link). Dummy PCI accesses are placed
 * between the steps. Does not wait for the new DMA to finish.
 * Always returns 0.
 */
int linkwrite_dma(int clusterid,int link,int size, unsigned int * buf)
{
    int k;
    int board;
    int base;
    unsigned int * area;

    board = clusters[clusterid]->ipboardid;
    PCIdummyaccess(board);
    wait_dma_to_finish(board);
    PCIdummyaccess(board);
    base = setbaseadr(link);

    area = dma_work_area[clusterid];
    for (k = 0; k < size; k++) {
        area[k] = buf[k];
    }

    PCIdummyaccess(board);
    setlinkmode(board, link);
    PCIdummyaccess(board);
    g6hib_dmar(board, clusters[clusterid]->dma_offset, base*sizeof(int), size);
    PCIdummyaccess(board);
    return 0;
}

void g6_set_ijp_mode(int clusterid,
		     int mode) /* mode=0: IP, 1: JP */
{
    int boardid =  clusters[clusterid]->ipboardid;
    hib_sending_jp[boardid] = mode;
    set_ijp_mode(boardid);
}
/* Pointer-argument variant of g6_set_ijp_mode(). */
void g6_set_ijp_mode_(int * clusterid,
		      int * mode) /* mode=0: IP, 1: JP */
{
    int cluster = *clusterid;
    int requested = *mode;

    g6_set_ijp_mode(cluster, requested);
}


void g6_setmcetc(int newval,int clusterid)
{
    GRAPE6_CLUSTER_PTR cp;
    g6_setmccount_(&newval);
    cp = clusters[clusterid];
    cp->nchipsperboard = cp->nchips /mccount; 
    cp->mccount = mccount;
}

void g6errorcounter_clear(int clusterid)
{
    struct grape6_errorcounters * ecp = &(clusters[clusterid]->errorcounters);
    ecp->jpperr_count = 0;
    ecp->ipperr_count = 0;
    ecp->memecc_count = 0;
    ecp->memuncorrect_count = 0;
    ecp->cmerror_count = 0;
}

/*
 * Print the five error counters of the given cluster to stderr on one
 * line, in the order jp, ip, ecc, ecc(uncorrectable), cm.
 *
 * The parameter is now declared explicitly as int; the previous
 * identifier-list form relied on implicit int, which C99 removed.
 */
void g6errorcounter_dump(int clusterid)
{
    struct grape6_errorcounters * ecp = &(clusters[clusterid]->errorcounters);

    fprintf(stderr,"Errors: jp, ip, ecc, ecc(u), cm = %d %d %d %d %d\n",
            ecp->jpperr_count, ecp->ipperr_count,
            ecp->memecc_count,ecp->memuncorrect_count,
            ecp->cmerror_count);
}


/*
 * Write an AUX control word to link 0 (internal registers) of the board.
 * The word is 0xffffff00 with auxval1 in bits 4..7 and auxval0 in
 * bits 0..3.
 */
void g6set_aux(unsigned int boardid,
	       unsigned int auxval0,
	       unsigned int auxval1)
{
    unsigned int upper_nibble = (auxval1 << 4) & 0xf0;
    unsigned int lower_nibble = auxval0 & 0xf;
    unsigned int data = 0xffffff00 | upper_nibble | lower_nibble;

#ifdef INTERNAL_OUT
    fprintf(stderr, "set aux, auxval0=%x auxval1=%x data=%x\n",
            auxval0, auxval1, data);
#endif
    linkwrite_oneword(boardid,0,data);
}
/*
 * Send the three-word sequence 0xFFFFFF00, 0xFFFFF000, 0xFFFFFF00 to
 * link 0 of the board, then mark the recorded IP/JP state and the
 * requested mode for that board as 0.
 */
void g6reset_pbonly(unsigned int boardid)
{
    static const unsigned int reset_words[3] = {
        0xFFFFFF00, 0xFFFFF000, 0xFFFFFF00
    };
    int k;

#ifdef INTERNAL_OUTn
    fprintf(stderr,"g6reset_pbonly called %d\n", boardid);
#endif
    for (k = 0; k < 3; k++) {
        linkwrite_oneword(boardid,0,reset_words[k]);
    }
    hib_ijp_state[boardid] = 0;
    hib_sending_jp[boardid] = 0;
}

/* Full reset of one board: reset_board() first, then g6reset_pbonly(). */
void g6reset(unsigned int boardid)
{

    reset_board(boardid);
    g6reset_pbonly(boardid);

}

void g6_reset_(int * clusterid)
{
    g6reset_pbonly(clusters[*clusterid]->ipboardid);
}

void g6_reset(int  clusterid)
{
    g6reset_pbonly(clusters[clusterid]->ipboardid);
}



/*
 * Set the LED mode by sending { PBLEDMODEADR, 1, mode } to IPLINK, one
 * word at a time.
 * Per the original author: mode = 1 is programmable, mode = 0 is
 * (believed to be) speedometer.
 */
void g6setledmode(unsigned int boardid,
		  unsigned int mode)
{
    unsigned int link = IPLINK;
    unsigned int words[3];
    int k;

    words[0] = PBLEDMODEADR;
    words[1] = 0x1;
    words[2] = mode;
    for (k = 0; k < 3; k++) {
        linkwrite_oneword(boardid,link,words[k]);
    }
}

/*
 * Send { address, 1, mode } to IPLINK one word at a time, where address
 * is B0PLLMODEADR for pllid 0 and B1PLLMODEADR for any other pllid.
 */
static void setpllmode(unsigned int boardid,unsigned int pllid,
		  unsigned int mode)
{
    unsigned int link = IPLINK;

#ifdef INTERNAL_OUT
    fprintf(stderr,"setpllmode, address = %x, data =  %x\n",pllid,mode);
#endif
    linkwrite_oneword(boardid,link,(pllid == 0) ? B0PLLMODEADR : B1PLLMODEADR);
    linkwrite_oneword(boardid,link,0x1);
    linkwrite_oneword(boardid,link,mode);
}

/*
 * Send { JPSPACEADDR, 1, mode } to IPLINK one word at a time.
 *
 * Layout of the mode word:
 *   bit 12    : version flag (1 when version >= 200, else 0)
 *   bits 11-6 : space start count (spstart)
 *   bits 5-0  : total burst count, stored as burstlen-1
 * A burstlen of 0 forces NULL mode: the whole word is 0.
 */
static void setjpspace(unsigned int boardid,
		       unsigned int version,
		       unsigned int spstart,
		       unsigned int burstlen)
{
    unsigned int link = IPLINK;
    unsigned int vshort = (version >= 200) ? 1 : 0;
    unsigned int mode;

    if (burstlen == 0) {
        mode = 0;
    } else {
        mode = (vshort<<12)|(spstart<<6)|(burstlen-1);
    }
#ifdef INTERNAL_OUT
    fprintf(stderr,"(setjpspace) v, s, b = %x %x %x %x\n",
            version, spstart, burstlen, mode);
#endif
    linkwrite_oneword(boardid,link,JPSPACEADDR);
    linkwrite_oneword(boardid,link,0x1);
    linkwrite_oneword(boardid,link,mode);
}

/* Send { CBJPMODEADR, 1, mode } to IPLINK, one word at a time. */
static void setcbjpmode(unsigned int boardid,
		  unsigned int mode)
{
    unsigned int link = IPLINK;
    unsigned int words[3];
    int k;

#ifdef CBTESTMAIN
    fprintf(stderr,"setcbjpmode, address = %x, data =  %x\n",CBJPMODEADR,mode);
#endif
    words[0] = CBJPMODEADR;
    words[1] = 0x1;
    words[2] = mode;
    for (k = 0; k < 3; k++) {
        linkwrite_oneword(boardid,link,words[k]);
    }
}

/*
 * Build the control-board JP mode word and send it with setcbjpmode().
 *
 * board_address: 3-bit address (111 = broadcast), placed in bits 11..13
 * jp_mode, a 10-bit pattern placed in bits 2..11:
 *     0 = local broadcast, 11 1101 0100 = 3D4
 *     1 = 2-way multicast, 11 1010 1100 = 3AC
 *     2 = p-to-p,          11 0110 1010 = 36A
 *     anything else        -> 3FF
 * output_select, placed in bits 0..1:
 *     0: select 0 (value 1), 1: select 1 (value 2),
 *     anything else: select BOTH (value 3)
 *     should select 0 or 2 for first CB
 */
void g6setcbjpmode(unsigned int boardid,
		   int board_address,
		   int jp_mode,
		   int output_select)
{
    unsigned int data;
    unsigned int pattern;
    unsigned int select_bits;

#if 0
    fprintf(stderr,"setcbjp, gid, address, mode, os = %d %d %d %d\n",
            boardid, board_address, jp_mode, output_select);
#endif
    switch (jp_mode) {
    case 0:  pattern = 0x3d4; break;
    case 1:  pattern = 0x3ac; break;
    case 2:  pattern = 0x36a; break;
    default: pattern = 0x3ff; break;
    }
    switch (output_select) {
    case 0:  select_bits = 1; break;
    case 1:  select_bits = 2; break;
    default: select_bits = 3; break;
    }
    data = (board_address & 0x7) << 11;
    data |= (pattern << 2);
    data |= select_bits;
    setcbjpmode(boardid, data);
}

	
/* Pointer-argument variant of g6setcbjpmode() that takes a cluster id and
   addresses that cluster's IP board. */
void g6_setcbjpmode_(unsigned int * clusterid,
		   int * board_address,
		   int * jp_mode,
		   int * output_select)
{
    unsigned int board = clusters[*clusterid]->ipboardid;

    g6setcbjpmode(board, *board_address, *jp_mode, *output_select);
}


/*
 * Choose the control-board JP mode from the multicast count and the number
 * of boards, and apply it with g6setcbjpmode(boardid, 15, mode, 2).
 *
 * The number of boards is maxchips divided by the chips per board
 * (16 when versionid < 199, otherwise 32).
 *   mccount 1             -> mode 0
 *   mccount 2, 2 boards   -> mode 2
 *   mccount 2, 4 boards   -> mode 1
 *   mccount 4             -> mode 2
 * More than 4 boards, or any other combination, prints a message and
 * terminates the program with exit(-1).
 */
void setup_cbjpmode(int boardid,
                  int maxchips,
                  int mccount,
                  int versionid)
{
    int maxchipsperboard = (versionid < 199) ? 16 : 32;
    int nboards = maxchips/maxchipsperboard;
    int mode;

    if (nboards > 4) {
        fprintf(stderr,"setup_cbjpmode, nbs = %d (%d/%d) not implemented.\n",
                nboards, maxchips, maxchipsperboard);
        exit(-1);
    }
    switch (mccount) {
    case 1:
        mode = 0;
        break;
    case 2:
        if (nboards == 2) {
            mode = 2 ;
        } else if (nboards == 4) {
            mode = 1;
        } else {
            fprintf(stderr, "setup_cbjpmode, mccount=%d  nboard = %d not possible.\n",
                    mccount, nboards);
            exit(-1);
        }
        break;
    case 4:
        mode = 2;
        break;
    default:
        fprintf(stderr, "setup_cbjpmode, mccount=%d not possible.\n",
                mccount);
        exit(-1);
    }
    g6setcbjpmode(boardid,15, mode,2);
}
        
void g6_change_cbjpmode(unsigned int clusterid,
			unsigned int mode /*0: BCAST, 1: MCAST */)
{
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];
    int boardid = cp->ipboardid;
    /*    fprintf(stderr,"g6changecbjpmode  %d %d\n",clusterid, mode); */
    if (mode){
	setup_cbjpmode(boardid, cp->maxchips,cp->mccount,cp->system_version);
    }else{
	g6setcbjpmode(boardid,15, 0,2);
    }
}
	
/* Pointer-argument variant of g6_change_cbjpmode(). */
void g6_change_cbjpmode_(unsigned int *	 clusterid,
			unsigned int *mode /*0: BCAST, 1: MCAST */)
{
    unsigned int cluster = *clusterid;
    unsigned int requested = *mode;

    g6_change_cbjpmode(cluster, requested);
}
	


/*
 * Walk a single set bit through the 13 patterns 0x1 .. 0x1000. Each
 * pattern is sent as { PBLEDDATAADR, 1, pattern } to IPLINK and echoed to
 * stderr, followed by a 60 ms sleep.
 */
void g6_led_test(unsigned int boardid)
{
    unsigned int link = IPLINK;
    unsigned int pattern = 1;

    fprintf(stderr,"testing LED ... ");
    while (pattern < 0x2000) {
        linkwrite_oneword(boardid,link,PBLEDDATAADR);
        linkwrite_oneword(boardid,link,0x1);
        linkwrite_oneword(boardid,link,pattern);
        fprintf(stderr,"  %x ", pattern);
        usleep(60000);
        pattern = pattern << 1;
    }
    fprintf(stderr,"LED test end.\n");
}
/* Send { PBLEDDATAADR, 1, data } to IPLINK, one word at a time. */
void g6_led_set(unsigned int boardid, unsigned int data)
{
    unsigned int link = IPLINK;
    unsigned int words[3];
    int k;

    words[0] = PBLEDDATAADR;
    words[1] = 0x1;
    words[2] = data;
    for (k = 0; k < 3; k++) {
        linkwrite_oneword(boardid,link,words[k]);
    }
}

/*
 * Test pattern generator: alternately writes 0 and 0xffffffff to the given
 * link, forever. This function never returns.
 */
void test_data(unsigned int boardid, unsigned int link)
{
    fprintf(stderr,"TEST DATA....\n");
    for (;;) {
        linkwrite_oneword(boardid,link,0);
        linkwrite_oneword(boardid,link,0xffffffff);
    }
}


/*
 * Drive the NCONFIG bits at `address`.
 *
 * The word sent starts as 0xfffffffe. With state == 0 the bits selected by
 * nconfigbits (shifted up by 2) are cleared, i.e. set LOW; with any other
 * state the word is sent unchanged. The word goes out as one
 * linkwrite_onedata() packet.
 */
void set_nconfig(int board,
		 int link,
		 unsigned int address,
		 unsigned int nconfigbits,
		 unsigned int state)
{
    unsigned int word = 0xfffffffe;

    if (state == 0) {
        /* state == 0 means one should set the specified bits to LOW */
        word &= ~(nconfigbits <<2);
    }
#if 0
    fprintf(stderr,"set_nconfig %x %x %x\n", nconfigbits, state, word);
#endif
    linkwrite_onedata(board, link, address, word);
}
/*
 * Clock one data bit out to `address`.
 *
 * Two packets are sent: first a word with bit 0 low and bit 1 holding the
 * least significant bit of `bit`, then the same word with bit 0 high. All
 * other bits are 1. According to the notes on send_fpga_data, bit 0 is
 * dclk and bit 1 is data0.
 */
void send_bit(int board,
		 int link,
		 unsigned int address,
		 unsigned int bit)
{
    unsigned int clock_low;
    unsigned int clock_high;

    clock_low = 0xffffffff & (0xfffffffc | ((bit & 1) <<1));
    clock_high = clock_low | 1;
    linkwrite_onedata(board, link, address, clock_low);
    linkwrite_onedata(board, link, address, clock_high);
}
   


/*
 * Helper for send_fpga_data(): in STEP_MODE builds, print `message` to
 * stderr, wait for a hex number on stdin and announce that the run
 * continues. Does nothing in other builds.
 */
static void fpga_step_pause(const char * message)
{
#ifdef STEP_MODE
    unsigned int x;
    fprintf(stderr,"%s", message);
    scanf("%x", &x);
    fprintf(stderr,"continuing...\n");
#else
    (void)message;
#endif
}

/* send_fpga_data
   Send an FPGA configuration file bit by bit.

   address: 0x420 for fofpga
   data:
      LSB:  dclk
        1:  data0
        2-6: nconfig

   file format: ttf (comma-separated decimal byte data)
   bit order: should send lsb first

   Sequence: NCONFIG is driven high, low, high (3 us sleep after each
   step), then every byte of the file is sent with eight send_bit() calls,
   least significant bit first. Every 0x1000 bytes the LEDs are set to
   datacount>>8.

   Returns 0 on success (always 0 when SIMONLY is nonzero), -1 if
   file_name is NULL or the file cannot be opened.

   A NULL file_name is no longer passed to a %s conversion (undefined
   behavior); the literal text "(null)" is printed instead.
 */
int send_fpga_data(int board,
		   int link,
		   int address,
		   int nconfigbits,
		   char * file_name)
{
    FILE * fin;
    int data;
    int datacount;
#if SIMONLY
    return 0;
#endif
    /*see if the data file name is set */
    if(file_name == NULL) {
        fprintf(stderr,"(send_fpga_data) open failed, file to open: %s\n", "(null)");
        return -1;
    }
    fprintf(stderr,"(send_fpga_data) file to open: %s\n", file_name);
    /* see if data file exists */
    fin = fopen(file_name,"r");
    if(fin == NULL){
        fprintf(stderr,"(send_fpga_data) failed to open defect file %s\n",
                file_name);
        return -1;
    }
    /* data file exists*/
    fpga_step_pause("now about to reset NCONFIG \n");
    set_nconfig(board, link, address, nconfigbits,1);
    usleep(3);

    fpga_step_pause("now about to set NCONFIG \n");
    set_nconfig(board, link, address, nconfigbits,0);
    usleep(3);

    fpga_step_pause("now about to reset NCONFIG \n");
    set_nconfig(board, link, address, nconfigbits,1);
    usleep(3);
    datacount = 0;

    fpga_step_pause("NCONFIG set end\n");

    while(fscanf(fin,"%d,",&data)==1){
        int i;
        /* eight bits per byte, least significant bit first */
        for(i=0;i<8;i++){
            send_bit(board,link,address,data);
            data = data >>1;
        }
        datacount++;

        /* progress indicator every 0x1000 bytes */
        if ((datacount & 0xfff) == 0){
#if INTERNAL_OUT0
            fprintf(stderr,"datacount = %x\n", datacount);
#endif
            g6_led_set(board, datacount>>8);
        }
    }
    fclose(fin);
    return 0;
}


/* Fallback configuration file prefix; only returned when the library is
   compiled with USE_DEFAULT_CONFIG_FILE. */
char * default_config_file = "/usr2/makino/src/g6hib/g6config.dat";

/*
 * Look up the environment variable whose name is GRAPE6_CONFIG_FILE and
 * return its value.  When it is unset, return default_config_file if
 * USE_DEFAULT_CONFIG_FILE is defined; otherwise print a message and
 * terminate the process with exit(-1).
 */
char * get_config_file_name_from_environment()
{
    char * envvalue = getenv(GRAPE6_CONFIG_FILE );

    if (envvalue != NULL) {
#if INTERNAL_OUT0
	printf("(get_config_file_name_from_environment) : p=%s\n", envvalue);
#endif
	return envvalue;
    }
#ifdef USE_DEFAULT_CONFIG_FILE
    return default_config_file;
#else
    fprintf(stderr,"(get_config_file_name_from_environment) ");
    fprintf(stderr,"env. ver. %s not set, exiting... ", GRAPE6_CONFIG_FILE);
    exit(-1);
#endif
}

/*
 * Build the name of the configuration file for cluster `configid`: the
 * prefix returned by get_config_file_name_from_environment() followed by
 * configid in decimal.
 *
 * returns a malloc'ed string owned by the caller (release with free()),
 * or NULL when the name cannot be formatted or memory is exhausted.
 */
char * get_full_config_file_name(int configid)
{
    char * p,* penv;
    int length;
    penv = get_config_file_name_from_environment();
    /* Ask snprintf for the exact length instead of assuming that the
       id needs at most four characters. */
    length = snprintf(NULL, 0, "%s%d", penv, configid);
    if (length < 0) return NULL;
    p = (char *)malloc((size_t)length + 1);
    if (p == NULL){
	fprintf(stderr,"(get_full_config_file_name) malloc failed\n");
	return NULL;
    }
    snprintf(p, (size_t)length + 1, "%s%d", penv, configid);
#ifdef INTERNAL_OUT0
	printf("(get_full_config_file_name) : p=%s\n", p);
#endif
    return p;
}

/*
 * dump_cluster_struct
 *
 * Print the contents of the cluster descriptor to stderr.
 *
 * Side effect: besides printing, this routine records the system
 * version of the cluster in hib_system_version[ipboardid] and selects
 * the link used for j-particles in hib_jplinkid[ipboardid] (JPLINK for
 * versions below 200, IPLINK otherwise).  Those tables have NCLUSTERS
 * entries, so they are left untouched (with an error message) when
 * ipboardid is outside 0..NCLUSTERS-1.
 */
void dump_cluster_struct(GRAPE6_CLUSTER_PTR cp)
{
    int i;
    fprintf(stderr,"Dumping cluster struct content\n");
    fprintf(stderr,"open state = %d\n",cp->open_state);
    fprintf(stderr,"ipb        = %d\n",cp->ipboardid);
    fprintf(stderr,"fob        = %d\n",cp->foboardid);
    fprintf(stderr,"maxchips   = %d\n",cp->maxchips);
    fprintf(stderr,"mbuftype   = %d\n",cp->mbuftype);
    fprintf(stderr,"ijpdma     = %d\n",cp->ijpdma);
    fprintf(stderr,"jpspace    = %d\n",cp->jpspace);
    fprintf(stderr,"softlimit  = %d\n",cp->time_soft_limit);
    fprintf(stderr,"hardlimit  = %d\n",cp->time_hard_limit);
    fprintf(stderr,"ndefects   = %d\n defects: ",cp->ndefects);
    for(i=0;i<cp->ndefects; i++){
	fprintf(stderr," %5d",cp->defects[i]);
    }
    fprintf(stderr,"\n");
    fprintf(stderr,"realchips  = %d\n chips: ",cp->nchips);
    for(i=0;i<cp->nchips; i++){
	fprintf(stderr," %5d",cp->realchips[i]);
    }
    fprintf(stderr,"\n");
    fprintf(stderr,"sysversion = %d\n",cp->system_version);
    fprintf(stderr,"foconf     = \"%s\"\n",cp->foconf);
    if (cp->system_version >= 200){
	int ichip;
	fprintf(stderr,"modfoconf   = \"%s\"\n",cp->modfoconf);
	for(ichip = 0;ichip<NFOCHIPSONPB;ichip++){
	    fprintf(stderr,"foconf[%d]   = \"%s\"\n",ichip,cp->foconfsub[ichip]);
	}
    }

    /* The per-board tables below are indexed by the board id that came
       from the configuration file; refuse to write outside of them. */
    if ((cp->ipboardid < 0) || (cp->ipboardid >= NCLUSTERS)){
	fprintf(stderr,
		"(dump_cluster_struct) ipboardid %d out of range 0..%d, hib tables not updated\n",
		cp->ipboardid, NCLUSTERS-1);
	return;
    }
    hib_system_version[cp->ipboardid] = cp->system_version;
    fprintf(stderr,"hib system version [%d] =%d\n",
	    cp->ipboardid,hib_system_version[cp->ipboardid]);
    if (hib_system_version[cp->ipboardid]< 200){
	hib_jplinkid[cp->ipboardid] = JPLINK;
    }else{
	hib_jplinkid[cp->ipboardid] = IPLINK;
    }
}



/* Clear the cluster's fofpga_state flag.  g6open() sends the FPGA
   configuration files again whenever it finds this flag at zero. */
void reset_fofpga(GRAPE6_CLUSTER_PTR  cluster)
{
    cluster->fofpga_state = 0;
}
/* Pointer-argument variant of g6_reset_fofpga(): the cluster id is
   passed by reference. */
void g6_reset_fofpga_(int *clusterid)
{
    const int id = *clusterid;

    reset_fofpga(clusters[id]);
}
/* Clear the fofpga_state flag of cluster `clusterid`, so that the next
   g6open() reconfigures the FPGAs.  The parameter now has an explicit
   type; the old identifier-list form relied on implicit int. */
void g6_reset_fofpga(int clusterid)
{
    reset_fofpga(clusters[clusterid]);
}
	
/*
 * Fully re-initialize a cluster: reset it, mark its FPGAs as
 * unconfigured, then close and reopen it (the reopen therefore sends
 * the FPGA configuration again).
 */
void g6_reinitialize(int clusterid)
{
    g6_reset(clusterid);
    g6_reset_fofpga(clusterid);

    g6_close(clusterid);
    g6_open(clusterid);
}
/* Pointer-argument variant of g6_reinitialize(). */
void g6_reinitialize_(int *clusterid)
{
    const int id = *clusterid;

    g6_reinitialize(id);
}

/* Number of memory-buffer types known to the library. */
#define NMBUFTYPES 9

/* Names accepted after the MBUFTYPE keyword of the configuration file.
   The position of a name in this table is the mbuftype index. */
char * mbufnames[NMBUFTYPES]={
    "Fairchild-LCX16374A",       /* 0 */
    "Toshiba-LCX16374A",         /* 1 */
    "TI-ALVCH16374",             /* 2 */
    "Fairchild-LCX16374A-New",   /* 3 */
    "Fairchild-LCX16374A-V3",    /* 4 */
    "TI-ALVCH16374-HM",          /* 5 */
    "TI-ALVCH16374-New",         /* 6 */
    "TEST2",                     /* 7 */
    "TEST4",                     /* 8 */
};

/* PLL mode words, one row per entry of mbufnames.  Column i is the
   value handed to setpllmode(boardid, i, ...). */
int mbufmodedata[NMBUFTYPES][2]={
    {0x1520, 0x151},   /* Fairchild-LCX16374A     */
    {0x1520, 0x152},   /* Toshiba-LCX16374A       */
    {0x1520, 0x155},   /* TI-ALVCH16374           */
    {0x1520, 0x154},   /* Fairchild-LCX16374A-New */
    {0x1221, 0x062},   /* Fairchild-LCX16374A-V3  */
    {0x1220, 0x155},   /* TI-ALVCH16374-HM        */
    {0x1420, 0x155},   /* TI-ALVCH16374-New       */
    {0x1420, 0x155},   /* TEST2                   */
    {0x1220, 0x155},   /* TEST4                   */
};

/*
  B0PLL[0] <- mbufmodedata(1:0) 00->0, 01->Z, 10,11->1;
  B0PLL[1] <- mbufmodedata(3:2) ...
  B0PLL[6:4] <- ZZZ fixed (ijpfpga2 only)

  B1PLL[4] <- Z fixed (ijpfpga2 only)

  --B0 PLL  Xtal-> CLK, BCLK

1Q: FB, 3.3V board clock(CLKC) ckbuf2
2Q: PLL1
3Q: BCLKA,B ckbuf1,4 module CLK
4Q: CLKA,B ckbuf3,9 board 2.5V clock

--0: 2F0
--1: 2F1
--2: 3F0
--3: 3F1
--4: 4F0
--5: 4F1
--6: FS

--FS  LOW, all others ALL MID?  0ZZZZZZ 

-- B1 PLL generates PCLKA/B,(sysclk) and MCLOCK (clock for buffers on board)
-- Not clear what should be done
-- reference comes from pll0-2q
-- FB <- 3q
1Q:PCLKA,B ckbuf5,6 module sysclock
2Q:MCLKA,B ckbuf7,8 module bclock (buffer clock)
3Q:FB

--0: 2F0
--1: 2F1
--2: 3F0
--3: 3F1
--4: FS
-- 991 mode select table
-- F1 F0      1Q,2Q        3Q       4Q    NUMBER
-- L L         -4         Div2     Div2   0000 = 0
-- L M         -3          -6       -6    0001 = 1 
-- L H         -2          -4       -4    0010 = 2
-- M L         -1          -2       -2    0100 = 4
-- M M          0           0        0    0101 = 5
-- M H          1           2        2    0110 = 6
-- H L          2           4        4    1000 = 8
-- H M          2           4        4    1001 = 9
-- H H          4         Div4     invert 1010 = A

TEST1
  
  1420, 154 = 01 01 00 00 10 00 00   01 01 01 01 00
  B0PLL = ZZ00100 B1PLL=ZZZZ0

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B1 2Q = MCLOCK?    Z0 = -1
  B1 3Q = FB         ZZ = 0

TEST2
  
  1220, 154 = 01 00 10 00 10 00 00   01 01 01 01 00
  B0PLL = Z010100 B1PLL=ZZZZ0

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B0 4Q = 2.5V       01 = -4
  B1 2Q = MCLOCK?    Z0 = -1
  B1 3Q = FB         ZZ = 0

TEST3
  
  B0: 1220, B1: 152
  B0PLL = Z010100 B1PLL=0ZZZ0

  B0 2Q = PLL1 (REF)  -4
  B0 3Q = G6MO        -4
  B0 4Q = 2.5V        -4
  B1 2Q = MCLOCK?     -2
  B1 3Q = FB           0

TEST4
  
  1220, 155 = 01 00 10 00 10 00 00  01 01 01 00 10
  B0PLL = 1 01 01 00 B1PLL=ZZZZZ

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B0 4Q = 2.5V       01 = -4
  B1 2Q = MCLOCK?    ZZ =  0
  B1 3Q = FB         ZZ = 0

  

Fairchild
  
  20, 51 = 0 10 00 00   01 01 00 01
  B0PLL = 0100 B1PLL=ZZ0Z

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B1 2Q = MCLOCK?    0Z = -3
  B1 3Q = FB         ZZ = 0

Toshiba
  
  20, 52 = 0 10 00 00   01 01 00 10
  B0PLL = 0100 B1PLL=ZZ01

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B1 2Q = MCLOCK?    01 = -2
  B1 3Q = FB         ZZ = 0

  TI
  
  20, 55 = 0 10 00 00   01 01 01 01
  B0PLL = 0100 B1PLL=ZZZZ

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B1 2Q = MCLOCK?    ZZ = 0
  B1 3Q = FB         ZZ = 0

  FC-new
  
  20, 54 = 0 10 00 00   01 01 01 00
  B0PLL = 0100 B1PLL=ZZZ0

  B0 2Q = PLL1 (REF) 00 = -4
  B0 3Q = G6MO       01 = -4
  B1 2Q = MCLOCK?    Z0 = -1
  B1 3Q = FB         ZZ = 0
*/

/*
 * check_and_set_mbuftype
 *
 * mbufname: buffer type name read from the configuration file
 * mbufok:   zero when no MBUFTYPE line was given; *mbuftype is then
 *           left unchanged and only reported
 * mbuftype: in/out, index into mbufnames
 *
 * An unknown name terminates the process with exit(-1).
 */
void  check_and_set_mbuftype(char * mbufname,
			     int mbufok,
			     int * mbuftype)
{
    int itype = 0;

    if (!mbufok){
	fprintf(stderr,"Using default MBUFTYPE = %s\n",mbufnames[*mbuftype]);
	return;
    }

    /* linear search; itype ends at NMBUFTYPES when nothing matches */
    while ((itype < NMBUFTYPES) && (strcmp(mbufname, mbufnames[itype]) != 0)){
	itype++;
    }
    if (itype == NMBUFTYPES){
	fprintf(stderr,"MBUFTYPE = %s is not supported, exiting...\n",mbufname);
	exit(-1);
    }
    *mbuftype = itype;
    fprintf(stderr,"Using MBUFTYPE = %s\n",mbufnames[itype]);
}

/* Program both PLL mode registers (index 0 and 1) of board `boardid`
   with the words stored in row `mode` of mbufmodedata. */
static void setup_pllmode(int boardid,
			   int mode)
{
    const int * words = mbufmodedata[mode];

    setpllmode(boardid,0,words[0]);
    setpllmode(boardid,1,words[1]);
}
/* sleep: drive the module chip clock by half freq.
 * change 3q0, 3q1 of PLL0 to low, low
 * that is, pllmode 3&2
 The code to set pllmodes is:
   b0pllmode(2) <= '0' when b0pllreg(5 downto 4) = "00"
                else 'Z' when b0pllreg(5 downto 4) = "01"
                else '1';
   b0pllmode(3) <= '0' when b0pllreg(7 downto 6) = "00"
                else 'Z' when b0pllreg(7 downto 6) = "01"
                else '1';
 Therefore, one should mask b0pllreg(7:4) by '0000'

 */
/* Same as setup_pllmode(), except that bits 7:4 of the word for PLL
   register 0 are cleared before it is sent; the word for register 1 is
   sent unchanged. */
static void setup_pllmode_for_sleep(int boardid,
			   int mode)
{
    int pll0_word = mbufmodedata[mode][0];
    int pll1_word = mbufmodedata[mode][1];

    pll0_word &= 0xffffff0f;
    setpllmode(boardid,0,pll0_word);
    setpllmode(boardid,1,pll1_word);
}

/*
 * g6_change_clock
 *
 * mode == 0: put the cluster's board into the "sleep" PLL setting
 * mode != 0: restore the normal PLL setting for the cluster's buffer
 *            type, then wait via uwait(2000)
 */
void g6_change_clock(int clusterid, int mode)
{
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];
    int boardid = cp->ipboardid;
    int buftype = cp->mbuftype;

    if (mode == 0){
	setup_pllmode_for_sleep(boardid,buftype);
	return;
    }
    setup_pllmode(boardid,buftype);
    uwait(2000);
}

/*
 * read_config_file
 *
 * Read the configuration file of cluster `config_id` (name obtained
 * from get_full_config_file_name) into *cluster, allocating the cluster
 * descriptor on first use.
 *
 * File format: one "KEYWORD value" pair per line; lines starting with
 * '#' are comments and blank lines are ignored.
 *
 * return code : 0---cluster already open, nothing done
 *               1---file read successfully
 *              -1---failure (allocation, open, missing/invalid entries)
 */
int read_config_file(int config_id,
		     GRAPE6_CLUSTER_PTR*  cluster)
{
    FILE * fin;
    char * file_name;
    char line[MAXLINEFORCONFIG];
    char keyword[MAXLINEFORCONFIG];
    int idefect;
    int maxdefects;
    int ipok = 0;
    int fook = 0;
    int mcok = 0;
    int fclinkok = 0;
    int fcok = 0;
    int modfcok = 0;
    int cbfcoka = 0;
    int cbfcokb = 0;
    int cbfcokc = 0;
    int mbufok = 0;
    GRAPE6_CLUSTER_PTR cp;

    int i,j;
    if ((*cluster != NULL) && ((*cluster)->open_state != 0)) return 0;

    if (*cluster == NULL){
	*cluster = malloc(sizeof(GRAPE6_CLUSTER));
	if (*cluster == NULL){
	    fprintf(stderr,"(read_config_file) internal error: malloc failed\n");
	    return -1;
	}
	/* malloc does not clear memory: open_state is printed by
	   dump_cluster_struct below, so give it a defined value */
	(*cluster)->open_state = 0;
	(*cluster)->fofpga_state = 0;
	(*cluster)->jp_buffer_size = -1;
    }
    cp = *cluster;
    /* capacity of the defect table (defects is written in place right
       after the malloc above, i.e. it is an array member) */
    maxdefects = (int)(sizeof(cp->defects)/sizeof(cp->defects[0]));

    file_name = get_full_config_file_name(config_id);
    if(file_name == NULL) return -1;
#ifdef INTERNAL_OUT
    fprintf(stderr,"(read_config_file) file to open: %s\n", file_name);
#endif
    fin = fopen(file_name,"r");
    if(fin == NULL){
	fprintf(stderr,"(read_cofig_file) failed to open config file %s\n",
		file_name);
	free(file_name);
	return -1;
    }
    idefect = 0;
    cp->system_version = 100; /* default for file without version number */
    cp->mbuftype = 0;
    cp->jpspace = 0;
    cp->ijpdma = 0;
    cp-> time_soft_limit = 100000000; /* default: practically unlimited */
    cp-> time_hard_limit = 100000000;
    while (fgets(line, MAXLINEFORCONFIG-5,fin) != NULL){
	/* number of fields converted on this line; every recognized
	   keyword needs exactly two (keyword and value) */
	int nfields = 2;

	/* first char '#' is a comment */
	if (line[0] == '#') continue;
	/* blank line: no keyword at all.  Without this test the keyword
	   of the previous line would be processed a second time. */
	if (sscanf(line,"%s", keyword) != 1) continue;

	if (strcmp(keyword, "IPLINK") == 0){
	    nfields = sscanf(line,"%s%d", keyword, &(cp->ipboardid));
	    if (nfields == 2) ipok = 1;
	}else if (strcmp(keyword, "FOLINK") == 0){
	    nfields = sscanf(line,"%s%d", keyword, &(cp->foboardid));
	    if (nfields == 2) fook = 1;
	}else if (strcmp(keyword, "MAXCHIP") == 0){
	    nfields = sscanf(line,"%s%d", keyword, &(cp->maxchips));
	    if (nfields == 2) mcok = 1;
	}else if (strcmp(keyword, "DEFECT") == 0){
	    if (idefect >= maxdefects){
		fprintf(stderr,"ERROR read_config_file: more than %d DEFECT lines in %s\n",
			maxdefects, file_name);
		fclose(fin);
		free(file_name);
		return -1;
	    }
	    nfields = sscanf(line,"%s%d", keyword, &(cp->defects[idefect]));
	    if (nfields == 2) idefect ++;
	}else if (strcmp(keyword, "SYSVERSION") == 0){
	    nfields = sscanf(line,"%s%d", keyword, &(cp->system_version));
	}else if (strcmp(keyword, "FOCONF") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->foconf);
	    if (nfields == 2){
		int ichip;
		fcok = 1;
		/* FOCONF is the default for every per-chip entry; a
		   later FOCONFn line overrides a single chip */
		for(ichip = 0; ichip <NFOCHIPSONPB;ichip++) strcpy(cp->foconfsub[ichip], cp->foconf);
	    }
	}else if (strcmp(keyword, "MODFOCONF") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->modfoconf);
	    if (nfields == 2) modfcok = 1;
	}else if (strcmp(keyword, "FOLINKCONF") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->folinkconf);
	    if (nfields == 2) fclinkok = 1;
	}else if (strcmp(keyword, "CBFOCONFA") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->cbfoconfa);
	    if (nfields == 2) cbfcoka = 1;
	}else if (strcmp(keyword, "CBFOCONFB") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->cbfoconfb);
	    if (nfields == 2) cbfcokb = 1;
	}else if (strcmp(keyword, "CBFOCONFC") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->cbfoconfc);
	    if (nfields == 2) cbfcokc = 1;
	}else if (strcmp(keyword, "MBUFTYPE") == 0){
	    nfields = sscanf(line,"%s %s", keyword, cp->mbufname);
	    if (nfields == 2) mbufok = 1;
	}else if (strcmp(keyword, "JPSPACE") == 0){
	    nfields = sscanf(line,"%s %d", keyword, &(cp->jpspace));
	}else if (strcmp(keyword, "IJPDMA") == 0){
	    nfields = sscanf(line,"%s %d", keyword, &(cp->ijpdma));
	}else if (strcmp(keyword, "TIMESOFTLIMIT") == 0){
	    nfields = sscanf(line,"%s %d", keyword, &(cp->time_soft_limit));
	}else if (strcmp(keyword, "TIMEHARDLIMIT") == 0){
	    nfields = sscanf(line,"%s %d", keyword, &(cp->time_hard_limit));
	}else{
	    /* check for optional per-chip FOCONFn specifications */
	    int ichip;
	    int isused = 0;
	    char keycompare[12];
	    for(ichip = 0;ichip< NFOCHIPSONPB; ichip++){
		sprintf(keycompare,"FOCONF%1d", ichip);
		if (strcmp(keyword, keycompare) == 0){
		    nfields = sscanf(line,"%s %s", keyword, cp->foconfsub[ichip]);
		    isused = 1;
		}
	    }
	    if (isused == 0){
		fprintf(stderr,"read_config_file: unknown line: %s\n", line);
	    }
	}
	if (nfields != 2){
	    fprintf(stderr,"read_config_file: value missing or malformed in line: %s\n", line);
	}
    }
    fclose(fin);
    check_and_set_mbuftype(cp->mbufname, mbufok,&(cp->mbuftype));

    if (cp->system_version < 200){
	/* for version 1 system, some words are not required */
	modfcok = 1;
	fclinkok = 1;
	mbufok = 1;
    }


    if (ipok*fook*mcok*fcok*modfcok*fclinkok*cbfcoka*cbfcokb*cbfcokc*mbufok == 0){
	fprintf(stderr,"ERROR read_config_file: parameters missing in %s\n",
		file_name);
	if (ipok == 0)fprintf(stderr,"    IPLINK\n");
	if (fook == 0)fprintf(stderr,"    FOLINK\n");
	if (mcok == 0)fprintf(stderr,"    MAXCHIP\n");
	if (fcok == 0)fprintf(stderr,"    FOCONF\n");
	if (mbufok == 0)fprintf(stderr,"    MBUFTYPE\n");
	if (modfcok == 0)fprintf(stderr,"    MODFOCONF\n");
	if (fclinkok == 0)fprintf(stderr,"    FOLINKCONF\n");
	if (cbfcoka == 0)fprintf(stderr,"    CBFOCONFA\n");
	if (cbfcokb == 0)fprintf(stderr,"    CBFOCONFB\n");
	if (cbfcokc == 0)fprintf(stderr,"    CBFOCONFC\n");
	free(file_name);
	return -1;
    }
    /* the name was only needed for the messages above */
    free(file_name);

    if (cp->ipboardid == cp->foboardid ){
	cp->singleinterface = 1;
    }else{
	cp->singleinterface = 0;
    }
    cp->ndefects = idefect;
    for(i=0;i<cp->ndefects-1; i++){
	if (cp->defects[i] >=cp->defects[i+1]){
	    fprintf(stderr,"ERROR read_config_file: DEFECTS should be ascending order\n");
	    fprintf(stderr,"Violation in locations %d %d, values %d %d\n",i, i+1,
		    cp->defects[i],cp->defects[i+1]);
	    fprintf(stderr,"You might think sorting is easy, well, then, you can do it for me.\n");
	    return -1;
	}
    }

    cp->nchips = cp->maxchips-cp->ndefects;
    fprintf(stderr,"(read_config_file) MAXCHIP, NCHIP = %d %d, CHIPS:\n",
	    cp->maxchips, cp->nchips);

    /* realchips = 0..maxchips-1 with the (ascending) defects removed.
       j must be tested before defects[j] is read. */
    for(i = j = 0; i<cp->maxchips; i++){
	if((j < cp->ndefects) && (i == cp->defects[j])){
	    j++;
	}else{
	    cp->realchips[i-j] = i;
	}
    }
    for(i=0;i<cp->nchips; i++){
	fprintf(stderr," %3d", cp->realchips[i]);
	if (i % 16 == 15) fprintf(stderr,"\n");
    }
    if (cp->nchips % 16 != 0)fprintf(stderr,"\n");
    /* not only diagnostics: dump_cluster_struct also records the system
       version and j-particle link id of this board */
    dump_cluster_struct(cp);

#ifdef INTERNAL_OUT0
    dump_cluster_struct(*cluster);
#endif
    return 1;
}

/* Time-scaling state; all three are assigned in g6_set_tunit_(). */
static double tmax_g6;      /* 2^(TIMESHIFT - tunit) */
static ULONG toffset_g6;    /* 1 << 52 */
static double tscaledmax;   /* toffset_g6 converted to double */
#define IEEEDOUBLEMANTISSA 52
#define TIMEOFFSET 10
#define TIMESHIFT (IEEEDOUBLEMANTISSA+TIMEOFFSET)


/* 2^(xunit - tunit); recomputed by set_vscale() */
static double vscale;

/* Recompute vscale = 2^(xunit - tunit) from the current units.  The
   function returns nothing; the old definition relied on implicit int
   and had no return statement. */
static void set_vscale(void)
{
    vscale = ldexp(1.0,(int)(xunit-tunit));
}

void g6_set_tunit_(int * newtunit)
{
  tunit = *newtunit;
  dtmin_g6 = 1.0/(((ULONG)1)<<tunit);
  frexp(dtmin_g6, &dtminexp_g6);
  tmax_g6 = ldexp(1.0,TIMESHIFT-tunit);
  toffset_g6 = ((ULONG)1)<<52;
  tscaledmax = toffset_g6;
  set_vscale();
}
/* By-value variant of g6_set_tunit_(). */
void g6_set_tunit(int  newtunit)
{
    int value = newtunit;

    g6_set_tunit_(&value);
}

void g6_set_xunit_(int * newxunit)
{
  xunit = *newxunit;
  xscale = ((ULONG)1)<<((int)xunit);
  xscaleinv = 1.0/xscale;
  xscale2 = xscale*xscale;
  set_vscale();
}

/* By-value variant of g6_set_xunit_(). */
void g6_set_xunit(int newxunit)
{
    int value = newxunit;

    g6_set_xunit_(&value);
}

/* Install the compiled-in default time and position units
   (DEFAULT_TUNIT, DEFAULT_XUNIT).  The locals are named so that they
   no longer shadow the file-level tunit/xunit, and the unused scratch
   variables (including a 200-word array) are gone. */
static void initialize_library_parameters(void)
{
    int default_tunit = DEFAULT_TUNIT;
    int default_xunit = DEFAULT_XUNIT;

    g6_set_tunit_(&default_tunit);
    g6_set_xunit_(&default_xunit);
}


/*
 * ipdatawrite
 *
 * Send one packet through IPLINK of board `boardid`.  buf[1] holds the
 * number of payload words; the packet is buf[0] .. buf[buf[1]+1], i.e.
 * buf[1]+2 words in total.  Returns whatever linkwrite() returns.
 */
int ipdatawrite(int boardid, unsigned int * buf)
{
#ifdef INTERNAL_OUT0
    {
	int iword;
	for(iword=0;iword<buf[1]+2;iword++){
	    fprintf(stderr,"IPWRITE %5d %8x\n", iword, buf[iword]);
	}
    }
#endif
    MB;
    return linkwrite(boardid,IPLINK,buf[1]+2, buf);
}
/*
 * ipdatawrite_dma
 *
 * DMA variant of ipdatawrite(): starts the transfer of the buf[1]+2
 * word packet, waits (buf[1]+2)/25 through uwait(), then waits for the
 * DMA on the cluster's IP board to finish.  Does nothing when REALHARD
 * is 0.  Always returns 0.
 */
int ipdatawrite_dma(int clusterid, unsigned int * buf)
{
#if REALHARD
    linkwrite_dma(clusterid, IPLINK, buf[1]+2, buf);

    uwait((buf[1]+2)/25);

    wait_dma_to_finish(clusters[clusterid]->ipboardid);
#endif
    return 0;
}

/* Maximum number of counter polls before fodatawait gives up. */
#ifdef INTERNAL_OUT0
#define WAITMAX 100
#else
#define WAITMAX 10000
#endif

/*
 * fodatawait
 *
 * Poll g6hib_getcounter_local(boardid) until it reports at least
 * `nwords`.  uwait(1) is called after a poll that returned zero.
 *
 * returns 0 once enough data is reported (more than requested is not
 * treated as an error), -1 after more than WAITMAX polls.
 */
int fodatawait(int boardid, int nwords)
{
    int navail;
    int npoll;

    for (npoll = 1; ; npoll++){
	navail = g6hib_getcounter_local(boardid);
#ifdef INTERNAL_OUT0
	fprintf(stderr,"(fodataread) ndata = %d\n",navail);
#endif
	if (npoll > WAITMAX){
	    fprintf(stderr,"(fodataread) TOO LONG WAIT, req, ret = %d %d ... maybe hardware problem\n",
		    nwords, navail);
	    return -1;
	}
	if (navail == 0) uwait(1);
	if (navail >= nwords) break;
    }
#ifdef INTERNAL_OUT0
    fprintf(stderr,"(fodataread) ndata = %d\n",navail);
#endif
    return 0;
}

/*
 * fodataread
 *
 * Read `nwords` words from board `boardid` into buf with linkread().
 * When linkread() reports an error the received words are dumped to
 * stderr.  Returns the linkread() status.
 */
int fodataread(int boardid, int nwords, unsigned int * buf)
{
    const int status = linkread(boardid,nwords, buf);

    if (status){
	int iword = 0;
	fprintf(stderr,"(fodataread) dumping the received data\n");
	while (iword < nwords){
	    fprintf(stderr,"data[%3d]= %x\n", iword, buf[iword]);
	    iword++;
	}
    }
    return status;
}

/*
 * check_defect_list_for_mc
 *
 * Record mccount in the cluster, derive nchipsperboard, and verify
 * that the defect list is usable when the chips are split over
 * `mccount` boards: the number of defects must be a multiple of
 * mccount and every board must carry the same number of defects.
 *
 * Any violation (including an mccount that does not fit the per-board
 * table, or a defect id outside the chip range) terminates the process
 * with exit(-1).
 */
static void check_defect_list_for_mc(GRAPE6_CLUSTER_PTR cp,
				     int mccount)
{
    int i;
    int ndefperboard[MAXBOARDSPERCLUSTER];
    int maxchipsperboard;

    /* mccount is used as a divisor and as the number of valid entries
       of ndefperboard */
    if ((mccount < 1) || (mccount > MAXBOARDSPERCLUSTER)){
	fprintf(stderr,"mccount %d out of range 1..%d\n",
		mccount, (int)MAXBOARDSPERCLUSTER);
	exit(-1);
    }
    maxchipsperboard = cp->maxchips /mccount;
    cp->nchipsperboard = cp->nchips /mccount;
    cp->mccount = mccount;
    /* first check defects count */
    if (cp->ndefects % mccount){
	fprintf(stderr,"Defect count %d not multiple of mccount %d\n",
		cp->ndefects, mccount);
	exit(-1);
    }
    /* then count defects per board */
    for(i=0;i<mccount;i++)ndefperboard[i] = 0;
    if ((cp->ndefects > 0) && (maxchipsperboard < 1)){
	fprintf(stderr,"maxchips %d too small for mccount %d\n",
		cp->maxchips, mccount);
	exit(-1);
    }
    for(i=0;i<cp->ndefects;i++){
	int iboard = cp->defects[i]/maxchipsperboard;
	if ((cp->defects[i] < 0) || (iboard >= mccount)){
	    fprintf(stderr,"Defect id %d outside chip range 0..%d\n",
		    cp->defects[i], maxchipsperboard*mccount-1);
	    exit(-1);
	}
	ndefperboard[iboard]++;
    }
    /* defects per board should be the same for all boards */
    for(i=1;i<mccount;i++){
	if(ndefperboard[i] != ndefperboard[0]){
	    fprintf(stderr,
		    "Defect count %d for board %d not equal to  %d(board 0)\n",
		    ndefperboard[i],i,ndefperboard[0]);
	    exit(-1);
	}
    }
}

/*
 * g6_reset_chip_vcids
 *
 * For every chip 0..maxchips-1 of the cluster: write a VCID word that
 * pairs chip i with virtual id i, then write an "inactive" word for id
 * i with the flag bit (LSB) cleared.
 */
void g6_reset_chip_vcids(int clusterid)
{
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];
    int board = cp->ipboardid;
    unsigned int packet[4];
    int ichip;

    /* VCID: the chip index goes to bits 10 and up, the id below */
    packet[0] = JPREG_VCID_ADDRESS;
    packet[1] = 0x1;
    for (ichip = 0; ichip < cp->maxchips; ichip++){
	packet[2] = (ichip<<10) | ichip;
	ipdatawrite(board,packet);
    }

    /* inactive flags -- 20-11 : mask, 10-1 : virtual chip id,
       LSB: inactive flag (left at 0 here) */
    packet[0] = FOREG_INACTIVE_ADDRESS;
    packet[1] = 0x1;
    for (ichip = 0; ichip < cp->maxchips; ichip++){
	packet[2] = (0x3ff<<11)|(ichip<<1);
	ipdatawrite(board,packet);
    }
}

/*
 * initialize_g6chips
 *
 * setup VCID and other stuff for each chip under cluster
 *
 * if mccmode, valid VCIDs are 0 ... nchips/mccount
 *
 * Every register block is sent with ipdatawrite(), which transmits
 * ipdata[1]+2 words: ipdata[0] (one of the *_ADDRESS constants),
 * ipdata[1] (number of payload words) and the payload starting at
 * ipdata[2].  The same ipdata array is reused for all packets, so the
 * order of the statements below matters.
 *
 * Requires cp->nchipsperboard to be non-zero (it is used as a modulus).
 */
static void initialize_g6chips(GRAPE6_CLUSTER_PTR cp)
{
    unsigned int ipdata[256];
    int board, i;
    board = cp->ipboardid;
    ipdata[0] = JPREG_BASE_ADDRESS;                  /* JPRG */
    ipdata[1] = 0x11;
    for(i=0;i<0x10;i++) ipdata[i+2] = i; /* address conversion table */
    ipdata[18] = 0x10;                  /* ND */
    ipdatawrite(board, ipdata);
    
    ipdata[0] = JPREG_MC_ADDRESS;
    ipdata[1] = 0x4;
    ipdata[2] = JP_ADLY;                   /* ADLY */
    ipdata[3] = JP_WDLY;                   /* WDLY */
    ipdata[4] = JP_ODLY;                   /* ODLY */
    ipdata[5] = JP_DDLY;                   /* DDLY */
    ipdatawrite(board,  ipdata);

    /* now comes the hard part... setting VCIDs */
    ipdata[0] = JPREG_VCID_ADDRESS;
    ipdata[1] = 0x1;
    /* first set them for non-defective chips */
    for (i=0;i <cp->nchips; i++){
	/* ivc: VCID, realchips[i]: physical ID */
	/* the VCID wraps around every nchipsperboard chips */
	int ivc = i % cp->nchipsperboard;
#if 0
	if (i != ivc){
	    fprintf(stderr,"Set VCID different from consective, %d %d %d\n",
		    i, ivc, cp->mccount);
	}
#endif
	ipdata[2] = ((cp->realchips[i])<<10) | ivc;
	ipdatawrite(board,ipdata);
    }
    /* then set them for defective chips --- defective chips are give large VCIDs */
    for (i=0;i <cp->ndefects; i++){
	/* i: VCID, realchips[i]: physical ID */
	//	ipdata[2] = ((cp->defects[i])<<10) | (i+cp->nchips);
	/* defective chip i gets VCID maxchips+i, above every VCID used
	   for a working chip */
	ipdata[2] = ((cp->defects[i])<<10) | (i+cp->maxchips);
	// test for neighbour read.
	ipdatawrite(board,ipdata);
    }
    
    
    ipdata[0] = IPREG_BASE_ADDRESS;                  /* IPRG */
    ipdata[1] = 0xd;
    for(i=0;i<0xd;i++) ipdata[i+2] = i; /* address conversion table */
    ipdatawrite(board,ipdata);
    
    ipdata[0] = IPREG_ND_ADDRESS;                  /* IPRG */
    ipdata[1] = 0x2;
    ipdata[2] = 0xd;                    /* ND */
    ipdata[3] = 0x0;                    /* testmode */
    ipdatawrite(board,ipdata);
    
    ipdata[0] = FOREG_BASE_ADDRESS;                 /* FORG */
    ipdata[1] = 0x10;
    ipdata[2] = 1;                      /* address conversion table      */ 
    ipdata[3] = 0;                      /* Note that LSB 32bit-word      */ 
    ipdata[4] = 3;                      /* has LOWER address -- swapping */ 
    ipdata[5] = 2;                      /* is necessary                  */ 
    ipdata[6] = 5;
    ipdata[7] = 4;
    ipdata[8] = 7;
    ipdata[9] = 6;
    /* fills ipdata[10..16] only.  The packet is 0x10 payload words long
       (ipdata[2..17]); ipdata[17] is not written here and still holds
       the 0xf stored by the JPRG table loop at the top of the function. */
    for(i=0x8;i<0xf;i++) ipdata[i+2] = i; 
#if 0
    ipdata[15] = 12;
    ipdata[14] = 13;
#endif
    ipdatawrite(board,ipdata);
    
    ipdata[0] = FOREG_ND_ADDRESS;                 /* FORG */
    ipdata[1] = 0x2;                    
    ipdata[2] = 0xe;                    /* ND */
    ipdata[3] = 0x0;                    /* INACTIVE -- initialize to 0 */
    ipdatawrite(board,ipdata);
    
    /* Now, we should set inactive bits for defective chips */
    ipdata[0] = FOREG_INACTIVE_ADDRESS;
    ipdata[1] = 0x1;
    for (i=0;i <cp->ndefects; i++){
	/* 20-11 : mask, 10-1 : virtual chip id, LSB: inactive flag */
	/* same VCID (maxchips+i) as assigned to defective chip i above */
	ipdata[2] =  (0x3ff<<11)|((i+cp->maxchips)<<1)|1; 
	ipdatawrite(board,ipdata);
    }
    
  

  ipdata[0] = CALC_BASE_ADDRESS;                 /* CARG */
  ipdata[1] = 0x2;                    
  ipdata[2] = CALC_LRAM;                    /* LRAM */
  ipdata[3] = CALC_LFORCE;                   /* LFORCE */
  ipdatawrite(board,ipdata);

  ipdata[0] = CUTOFF_BASE_ADDRESS;                  /* set cutoff table (no cutoff) */
  ipdata[1] = 0x80;
  for(i=0;i<0x80;i++) ipdata[i+2] = 0x2000001;
  ipdatawrite(board,ipdata);

  /* sends the current value of the global global_rscale */
  ipdata[0] = CUTOFF_RSCALE_ADDRESS;                  /* set rscale for cutoff table */
  ipdata[1] = 0x1;
  ipdata[2] = global_rscale;
  ipdatawrite(board,ipdata);

}

void g6_initialize_chips(int clusterid)
{
    initialize_g6chips(clusters[clusterid]);
}



/* Store `mode` in the use_simulator field of the cluster's simulator
   state.  Compiles to an empty function unless SIMULATOR is non-zero. */
void set_simulator_use(GRAPE6_CLUSTER_PTR cp,
		       int mode)
{
#if SIMULATOR
    (cp->simg6p)->use_simulator = mode;
#endif
}

/* Pointer-argument variant of g6_set_simulator_use(). */
void g6_set_simulator_use_(int * clusterid, int * mode)
{
    const int id    = *clusterid;
    const int value = *mode;

    set_simulator_use(clusters[id], value);
}
void g6_set_simulator_use(int clusterid, int mode)
{
    set_simulator_use(clusters[clusterid], mode);
}

/*
 * send_config_data_to_fpga_version_1
 *
 * Configuration sequence used for system versions below 200.
 * Address 0x422 receives cbfoconfa, cbfoconfb (twice) and cbfoconfc on
 * nconfig bits 1, 2, 4 and 8; address 0x420 then receives the FOCONF
 * file on nconfig bits 1, 2, 4, 8 and 16 (or a hard-wired file set
 * when built with USE_SPECIAL_CONFIGULATION).
 *
 * returns the sum of the send_fpga_data() results (0 when all succeed).
 */
int send_config_data_to_fpga_version_1(GRAPE6_CLUSTER_PTR cp,
				       unsigned int boardid,
				       unsigned int link)
{
    int status = 0;
    int k;
    char * cbfiles[4];

    cbfiles[0] = cp->cbfoconfa;
    cbfiles[1] = cp->cbfoconfb;
    cbfiles[2] = cp->cbfoconfb;
    cbfiles[3] = cp->cbfoconfc;
    for (k = 0; k < 4; k++){
	status += send_fpga_data(boardid, link, 0x422, 1<<k, cbfiles[k]);
    }
#ifndef USE_SPECIAL_CONFIGULATION    
    /* the same configuration file goes to all of the five FPGAs */
    for (k = 0; k < 5; k++){
	status += send_fpga_data(boardid, link, 0x420, 1<<k, cp->foconf);
    }
#else
    
    fprintf(stderr,"(g6open) use special conf for the first board.... \n");
#if 0 /* full 12 chip mode of the FIRST board */
    status += send_fpga_data(boardid, link, 0x420, 1,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_with_fifo.ttf");
    status += send_fpga_data(boardid, link, 0x420, 28,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_with_fifo_rh.ttf");
    status += send_fpga_data(boardid, link, 0x420, 2,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_dum.ttf");
#endif
#if 1    
    status += send_fpga_data(boardid, link, 0x420, 1,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_thru2-3.ttf");
    status += send_fpga_data(boardid, link, 0x420, 2,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_dum.ttf");
    status += send_fpga_data(boardid, link, 0x420, 4,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_dum.ttf");
    status += send_fpga_data(boardid, link, 0x420, 8,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_dum.ttf");
    status += send_fpga_data(boardid, link, 0x420, 16,
			   "/usr2/makino/src/grape6board/processor_board/fo_unit_with_fifo_rh.ttf");
#endif
#endif
    return status;
}

/*
 * send_config_data_to_fpga_version_2
 *
 * Configuration sequence used for system versions 200..299.
 * Address 0x422 receives cbfoconfa, cbfoconfb (twice) and cbfoconfc on
 * nconfig bits 1, 2, 4 and 8.  Unless built with CBTESTMAIN, address
 * 0x420 then receives the seven per-chip FOCONF files (nconfig bit 0x8
 * is skipped), the module file on bits 0x100 and 0x200, and the folink
 * file on bit 0x400.
 *
 * returns the sum of the send_fpga_data() results (0 when all succeed).
 */
int send_config_data_to_fpga_version_2(GRAPE6_CLUSTER_PTR cp,
				       unsigned int boardid,
				       unsigned int link)
{
    int status = 0;

    status += send_fpga_data(boardid, link, 0x422, 0x1, cp->cbfoconfa);
    status += send_fpga_data(boardid, link, 0x422, 0x2, cp->cbfoconfb);
    status += send_fpga_data(boardid, link, 0x422, 0x4, cp->cbfoconfb);
    status += send_fpga_data(boardid, link, 0x422, 0x8, cp->cbfoconfc);

#ifndef CBTESTMAIN    
    {
	/* nconfig bit for each of the seven FPGAs on the PB */
	static const int pbbits[7] = {0x1, 0x2, 0x4, 0x10, 0x20, 0x40, 0x80};
	int ichip;

	for (ichip = 0; ichip < 7; ichip++){
	    status += send_fpga_data(boardid, link, 0x420, pbbits[ichip],
				     cp->foconfsub[ichip]);
	}
    }

    /* send data for on-module FPGA */
    status += send_fpga_data(boardid, link, 0x420, 0x100, cp->modfoconf);
    status += send_fpga_data(boardid, link, 0x420, 0x200, cp->modfoconf);

    /* send data for folink */
    status += send_fpga_data(boardid, link, 0x420, 0x400, cp->folinkconf);
#endif
    return status;
}

int g6open(int clusterid)
{
    unsigned int link,data, status, boardid;
    int iret;
    unsigned int pagesize;
    unsigned long pagemask, dma_address0;
    int pageoffset;
    GRAPE6_CLUSTER_PTR cp;
    int first_call = 0;
    static int linux_version;
    int waittime;

    
    if (clusters[clusterid] == NULL){
	/* config file not read yet */
	first_call = 1;
	linux_version = getlinuxversiontype();
	iret = read_config_file(clusterid, clusters+clusterid);
	if (iret == 0){
	    /* cluster is already open and available. returns 0 */
	    return 0;
	}
	if (iret < 0){
	    /* negative return codes implies some error */
	    fprintf(stderr,"(g6open) read_config_file returns %d, exiting...\n",
		    iret);
	    exit(-1);
	}
	calibrate_dummyloop();
    }else if (clusters[clusterid]->open_state){
	/* cluster is already open */
	return 0;
    }
    cp = clusters[clusterid];
    cp->mccount = mccount;
    boardid = cp->ipboardid;
    link = IPLINK;
#ifdef INTERNAL_OUT
    fprintf(stderr,"(g6open) Output from board %d Link %d\n", boardid,link);
#endif

    if (first_call){
	cp->cputime_for_last_call = 0;
    }

    waittime = -1;
    if (cp->cputime_for_last_call > cp->time_soft_limit){
	waittime = cp->cputime_for_last_call - cp->time_soft_limit;
    }
    iret = linkopen(boardid,waittime);
    if (iret != 0){
	fprintf(stderr,"(g6open) linkopen for boardid = %d failed\n", boardid);
	return -1;
    }

    if (cp->singleinterface != 1){
	iret = linkopen(cp->foboardid,0);
	if (iret != 0){
	    fprintf(stderr,"(g6open) linkopen for boardid = %d failed\n", cp->foboardid);
	    return -1;
	}
#if 0	 
	fprintf(stderr,"(g6open) separate interfaces for IP/JP and FO is not supported yet... Sorry\n");
	exit(-1);
#endif
    }
	
    /*
     * Here follows the low-level initialization of the board
     */
    g6_change_clock(clusterid, 1);
    uwait(10);
    g6reset_pbonly(boardid);
    setjpspace(boardid,cp->system_version,18,18+cp->jpspace); 

    if (cp->fofpga_state == 0){
	g6errorcounter_clear(clusterid);
	g6reset(boardid);
	g6reset_pbonly(boardid);
	uwait(100);
	g6_change_cbjpmode(clusterid,0);
	uwait(100);
	g6errorcounter_clear(clusterid);
	uwait(100);
	g6reset(boardid);
	uwait(100);
	g6reset_pbonly(boardid);
	uwait(100);

	/* set PLL mode */
	setup_pllmode(boardid,cp->mbuftype);
	uwait(100);


	g6setledmode(boardid, PBLED_PROGRAM_MODE);
	g6_led_test(boardid); /* Showing off LEDs ... */

	/*	test_data(boardid, link);*/

	/* Now write FOUNIT configulation files */

	if(cp->system_version <200){
	    iret = send_config_data_to_fpga_version_1(cp,boardid,link);
	}else if (cp->system_version <300){
	    iret = send_config_data_to_fpga_version_2(cp,boardid,link);
	}else{
	    fprintf(stderr,"System version=%d > 299 unsupported\n",
		    cp->system_version);
	    exit(-1);
	}

	if (iret){
	    fprintf(stderr,"(g6open) failed to setup FOUNIT configulation -- exiting\n");
	    exit(-1);
	}
	cp->fofpga_state = 1;
	g6reset_pbonly(boardid);
    }
    g6setledmode(boardid, PBLED_SPEEDOMETER_MODE);
    initialize_library_parameters();
    check_defect_list_for_mc(cp, mccount);
    initialize_g6chips(cp);
    g6hib_foerror_clear(cp->foboardid);
    cp->open_state = 1;
    cp->fofferror_check_mode  = 0;
    cp->jp_buffered_mode  = 0;
    cp->nj = -1; /* initialize nj to negative so that at first call to set_njp
		    adjustment of unused location is always invoked */
    global_rscale = 0x0073c800; /* temporary */

    /* allocate DMA work area */
#if 0 /*Old style */
#ifdef DMA
#ifndef NEWPHIB
    {
	int i;
	for(i=0;i < DMA_MAX_SIZE*3; i++){
	    dma_real_work_area[clusterid][i] = i;
	}
    }
    dma_address0 =TBdmaMapLoad(boardid,(caddr_t)
				(dma_real_work_area[clusterid]+DMA_MAX_SIZE),
				DMA_MAX_SIZE);
    pagesize = getpagesize();
    pagemask = pagesize - 1;
    pagemask = ~pagemask;
    cp->dma_offset= dma_address0 & pagemask;
    pageoffset = ((int) dma_address0) - cp->dma_offset;
    /*    fprintf(stderr,"get_page_size = %x %lx %x\n", getpagesize(), pagemask, pageoffset);
    fprintf(stderr,"(g6open) DMA MAP = %lx\n", cp->dma_offset);*/
    dma_work_area[clusterid] = dma_real_work_area[clusterid]+DMA_MAX_SIZE-pageoffset/sizeof(int);
    
    /*    fprintf(stderr,"(g6open) DMA buffers = %lx %lx\n",
	    (long) (dma_real_work_area[clusterid]+DMA_MAX_SIZE),
	    (long) (dma_work_area[clusterid]));*/
    
    
#else
    fprintf(stderr,"calling TBgetDmaInfo\n");

    
    TBgetDmaInfo(boardid, &(cp->dma_offset), &(dma_work_area[clusterid]));
    fprintf(stderr,"(g6open) DMA MAP = %x %x\n", cp->dma_offset, (int) dma_work_area[clusterid]);
    dma_work_area[clusterid][0] = 0x12345678;
    fprintf(stderr,"DMA first loc =%x\n", dma_work_area[clusterid][0]);
#endif
    sethibdmamode(boardid);
#endif
#endif /* Old style */
    /* new style
       basic order:
       allocate memory if
              1) NON-DMA
	      2) if DMA, Linux version 2.2.x
       else if version 2.4
              use ioctl
	      */
    if (first_call){
#ifndef DMA
	dma_work_area[clusterid] = 	dma_real_work_area[clusterid]; 
#endif	
#ifdef DMA
	if (linux_version <= 0){
	    int i;
	    for(i=0;i < DMA_MAX_SIZE*3; i++){
		dma_real_work_area[clusterid][i] = i;
	    }
	    dma_address0 =TBdmaMapLoad(boardid,(caddr_t)
				       (dma_real_work_area[clusterid]
					+DMA_MAX_SIZE),
				       DMA_MAX_SIZE);
	    pagesize = getpagesize();
	    pagemask = pagesize - 1;
	    pagemask = ~pagemask;
	    cp->dma_offset= dma_address0 & pagemask;
	    pageoffset = ((int) dma_address0) - cp->dma_offset;
	    dma_work_area[clusterid] = dma_real_work_area[clusterid]+DMA_MAX_SIZE-pageoffset/sizeof(int);
	    
	    
	}else if((linux_version == 1)||(linux_version == 2)){
	    if(linux_version == 2){
		fprintf(stderr,"Linux version 2.6. Hope 2.4 interface works...\n");
	    }

	    fprintf(stderr,"calling TBgetDmaInfo\n");
	    TBgetDmaInfo(boardid, &(cp->dma_offset), &(dma_work_area[clusterid]));
	    fprintf(stderr,"(g6open) DMA MAP = %x %x\n", cp->dma_offset, (int) dma_work_area[clusterid]);
	    dma_work_area[clusterid][0] = 0x12345678;
	    fprintf(stderr,"DMA first loc =%x\n", dma_work_area[clusterid][0]);
	}else{
	    fprintf(stderr,"Unknown linux version\n");
	    return -1;
	}
	sethibdmamode(boardid);
#endif
    }
    cp->cputime_at_start = cpu_second();
#ifdef SIMULATOR
    cp->simg6p = malloc(sizeof(SIMGRAPE6_CLUSTER));
    if (cp->simg6p == NULL){
	fprintf(stderr,"(g6open) internal error: malloc failed for simulator\n");
	return -1;
    }
    if (initialize_simulator(cp->simg6p)){
	fprintf(stderr,"(g6open) internal error: simulator initialization failed\n");
	return -1;
    }
    set_simulator_use(cp,1);
#endif    
    
}



/* By-reference entry point (trailing-underscore naming): dereferences the
   cluster id and forwards it to g6open(), returning g6open()'s result. */
int g6_open_(int *clusterid)
{
    const int id = *clusterid;

    return g6open(id);
}
/* By-value alias of g6open(); the return value is passed through unchanged. */
int g6_open(int clusterid)
{
    int status = g6open(clusterid);

    return status;
}

int g6close(int clusterid)
{
    GRAPE6_CLUSTER_PTR  cluster = clusters[clusterid];
    if ((cluster == NULL) || (cluster->open_state == 0)) return 0;
    g6_change_clock(clusterid,0);
    cluster->open_state = 0;
    linkclose(cluster->ipboardid);
    if (cluster->singleinterface != 1){
	/* not implemented yet ... */
    }
    if(cluster->fofpga_state==0)g6errorcounter_dump(clusterid);
    cluster->cputime_for_last_call = cpu_second()-cluster->cputime_at_start;
    if (cluster->cputime_for_last_call> cluster->time_hard_limit*2){
	fprintf(stderr,"GRAPE 6 used for %f sec,TOO LONG. I QUIT.\n",
		cluster->cputime_for_last_call);
	exit(-1);
    }
    return 0;
}


/* By-reference entry point: dereferences the cluster id and forwards it to
   g6close(), returning g6close()'s result. */
int g6_close_(int *clusterid)
{
    const int id = *clusterid;

    return g6close(id);
}
/* By-value alias of g6close(); the return value is passed through unchanged. */
int g6_close(int clusterid)
{
    int status = g6close(clusterid);

    return status;
}


/*
 * Write the current time *ti to the board of cluster *clusterid
 * (by-reference calling convention).
 *
 * The time is converted to an integer by adding tmax_g6, taking the low
 * 52 bits (the IEEE-754 fraction field) of the resulting double and shifting
 * them left by TIMEOFFSET.  The 64-bit result is sent as two 32-bit words
 * (high word first) after the TI_BASE_ADDRESS header via ipdatawrite().
 * With SIMULATOR defined the same value is also handed to sim_set_ti().
 *
 * The bit pattern of the double is obtained with memcpy rather than a
 * pointer cast, so the read does not violate the strict-aliasing rule.
 */
void g6_set_ti_(int *clusterid, double *ti)
{
  unsigned int ipdata[10];
  ULONG iti;
  ULONG ti2bits;
  double ti2;
#ifdef DEBUG
  /* reference value computed the slow way, compared against iti below */
  ULONG iti2 = (ULONG) ldexp(*ti, (int)tunit);
#endif
  ti2 = (*ti + tmax_g6);
  memcpy(&ti2bits, &ti2, sizeof ti2bits);
  iti = (ti2bits & 0x000fffffffffffffL)<<TIMEOFFSET;
#ifdef DEBUG
  if (iti != iti2){
      fprintf(stderr,"(set_ti) ti, iti, iti2 = %g %Lx %Lx\n", *ti,iti, iti2);
  }
#endif
  ipdata[0] = TI_BASE_ADDRESS;
  ipdata[1] = 0x2;          /* two data words follow */
  ipdata[2] = (iti>>32);    /* high 32 bits */
  ipdata[3] = iti ;         /* low 32 bits */
#ifdef INTERNAL_OUT
  /* print the value of ti, not the pointer */
  fprintf(stderr,"(g6_set_ti) ti = %g, %llx\n", *ti, (unsigned long long) iti);
#endif
  ipdatawrite(clusters[*clusterid]->ipboardid,ipdata);
#ifdef SIMULATOR
  sim_set_ti(iti);
#endif
}

/* By-value wrapper: passes the addresses of local copies of the arguments
   to g6_set_ti_(). */
void g6_set_ti(int clusterid, double ti)
{
    int id = clusterid;
    double t = ti;

    g6_set_ti_(&id, &t);
}

/*
 * Older variant of convert_predictor_time2() that reinterprets doubles
 * through the DATAPACK union.
 *
 * Outputs:
 *   *dtexp  = 1 - idtexp, where idtexp is the biased exponent field of
 *             rdtj minus 0x3fe
 *   *dtjmsb = idtexp - dtminexp_g6
 *   *tjlsb  = lowest bit of the bit pattern of (rtj/rdtj + tscaledmax)
 *
 * Returns 0 on success, -1 (after printing to stderr) when the fraction
 * field of rdtj is non-zero or when rtj is not a multiple of rdtj.
 */
LONG convert_predictor_time2_old(ULONG *  tjlsb, /* LSB of tj */
			    ULONG * dtjmsb, /* location of MSB of delta ti */
			    LONG * dtexp, /* exponent of dt in physical unit */
			    double rtj, /* particle time */
			    double rdtj) /* particle timestep */
		
{
  
  /* first, we calculate time values */
  register int idtexp;
  register ULONG t_int;	/* only used when DEBUG is defined */
  DATAPACK pdtint,ptbiased;
  ULONG tjlsb2;		/* only used when DEBUG is defined */
  double t_biased;	/* not used in this function */
#ifdef INTERNAL_OUT0
  fprintf(stderr,"convert_p_t, tj, dtj = %e %e\n", rtj, rdtj);
#endif
  pdtint.d = rdtj;
  /* extract the 11-bit exponent field and remove the offset 0x3fe */
  idtexp = ((pdtint.i64 >>52)&0x7ff) -0x3fe;
  /* any non-zero fraction bit is rejected */
  if(pdtint.i64 & 0x000fffffffffffffL){
    fprintf(stderr, "convert_predictor: dt_int  = %lx != 0.5!\n", pdtint.i64);
    return -1;
  }
  *dtexp = 1-idtexp;
  *dtjmsb = idtexp - dtminexp_g6;
#ifdef DEBUG  
  /* cross-check: compute the LSB from an explicit integer quotient */
  t_int = rtj/rdtj;
  if (t_int * rdtj != rtj){
      fprintf(stderr, "tj not multiple of dtj %le %le %le\n",
	      rtj, rdtj, rtj/rdtj);
      return -1;
  }
  tjlsb2 = (ULONG) t_int & (ULONG) 1;
#endif
  if (fmod (rtj, rdtj) != 0.0){
            fprintf(stderr, "tj not multiple of dtj %le %le %le\n",
	      rtj, rdtj, rtj/rdtj);
      return -1;
  }
  /* take the lowest bit of the biased quotient's bit pattern */
  ptbiased.d = rtj/rdtj + tscaledmax;
  *tjlsb =   ptbiased.i64 & (ULONG) 1;
#ifdef DEBUG
  if ((int)(*tjlsb) !=  (int)(tjlsb2)){
      fprintf(stderr,"%x %x\n", (int)(*tjlsb), (int)(tjlsb2));
  }
  fprintf(stderr,"convert_p_t, tj, dtj = %e %e %Lx %Lx %Lx\n", rtj, rdtj,
	  *tjlsb, *dtjmsb, *dtexp);
#endif
  return 0; 
}


/*
 * Convert particle time rtj and timestep rdtj to the integer fields used
 * by the predictor.
 *
 * Outputs:
 *   *dtexp  = 1 - idtexp, where idtexp is the biased exponent field of
 *             rdtj minus 0x3fe
 *   *dtjmsb = idtexp - dtminexp_g6
 *   *tjlsb  = lowest bit of the bit pattern of (rtj/rdtj + tscaledmax)
 *
 * Returns 0 on success, -1 (after printing to stderr) when the fraction
 * field of rdtj is non-zero or when rtj is not a multiple of rdtj.
 *
 * The bit patterns of the doubles are copied out with memcpy instead of
 * being read through a ULONG pointer cast, which violated the
 * strict-aliasing rule.
 */
LONG convert_predictor_time2(ULONG *  tjlsb, /* LSB of tj */
			    ULONG * dtjmsb, /* location of MSB of delta ti */
			    LONG * dtexp, /* exponent of dt in physical unit */
			    double rtj, /* particle time */
			    double rdtj) /* particle timestep */

{
  /* first, we calculate time values */
  int idtexp;
  ULONG dt_int;
  ULONG t_biased_bits;
  double t_biased;
#ifdef DEBUG
  ULONG t_int;
  ULONG tjlsb2;
#endif
#ifdef INTERNAL_OUT0
  fprintf(stderr,"convert_p_t, tj, dtj = %e %e\n", rtj, rdtj);
#endif
  memcpy(&dt_int, &rdtj, sizeof dt_int);
  /* extract the 11-bit exponent field and remove the offset 0x3fe */
  idtexp = ((dt_int >>52)&0x7ff) -0x3fe;
  /* any non-zero fraction bit is rejected */
  if(dt_int & 0x000fffffffffffffL){
    fprintf(stderr, "convert_predictor: dt_int  = %lx != 0.5!\n", dt_int);
    return -1;
  }
  *dtexp = 1-idtexp;
  *dtjmsb = idtexp - dtminexp_g6;
#ifdef DEBUG
  /* cross-check: compute the LSB from an explicit integer quotient */
  t_int = rtj/rdtj;
  if (t_int * rdtj != rtj){
      fprintf(stderr, "tj not multiple of dtj %le %le %le\n",
	      rtj, rdtj, rtj/rdtj);
      return -1;
  }
  tjlsb2 = (ULONG) t_int & (ULONG) 1;
#endif
  if (fmod (rtj, rdtj) != 0.0){
            fprintf(stderr, "tj not multiple of dtj %le %le %le\n",
	      rtj, rdtj, rtj/rdtj);
      return -1;
  }
  t_biased = rtj/rdtj + tscaledmax;
  memcpy(&t_biased_bits, &t_biased, sizeof t_biased_bits);
  *tjlsb =   t_biased_bits & (ULONG) 1;
#ifdef DEBUG
  if ((int)(*tjlsb) !=  (int)(tjlsb2)){
      fprintf(stderr,"%x %x\n", (int)(*tjlsb), (int)(tjlsb2));
  }
  fprintf(stderr,"convert_p_t, tj, dtj = %e %e %Lx %Lx %Lx\n", rtj, rdtj,
	  *tjlsb, *dtjmsb, *dtexp);
#endif
  return 0;
}

/*
 * Determine the common exponent for the predictor terms x[1]..x[4].
 *
 * For each i in 1..4 the biased exponent field of x[i] is combined as
 *     ek0 - dtexp*i + mkmintab[i]
 * and the maximum over i, minus 0x3fe, is stored in *e0b.
 * x[0] and the xunit argument are not read here.  Always returns 0.
 *
 * The bit pattern of each double is copied out with memcpy instead of
 * being read through a ULONG pointer cast (strict-aliasing violation).
 */
ULONG determine_predictor_exponent2(LONG * e0b,
				   LONG dtexp,
				   double x[5], /* position, vel ... */
				   ULONG xunit /* position resolution= 2**-xunit*/)
{

  /* since we use common exponent for a2-v, they should be
     calculated in the same way, except for the difference in the
     power of dt.

     For velocity, we have the exponent same as the original velocity,
     since velocity is already normalized and dt does not exceed one.
     In order to prevent possible overflow, mantissa of v should be
     downshifted (to use only 23 bits).
     */

    int e0btmp, ek0,  e0bmax;
    ULONG l;
    int i;

    e0bmax = -1000000000;
    for(i=1;i<5;i++){
	memcpy(&l, &x[i], sizeof l);
	/* Here, I skip the test for zero, since zero would anyway treated as
	   very small exponent value, which is okay */
	ek0 =  (l>>52)&0x7ff;
	e0btmp = ek0 - dtexp*i + mkmintab[i];
	if (e0bmax < e0btmp) e0bmax = e0btmp;
    }
    *e0b = e0bmax-0x3fe;
#ifdef INTERNAL_OUT
    fprintf(stderr,"(determine) e0b = %lx\n", *e0b);
#endif
    return 0;
}

ULONG convert_predictor_using_e0b2(ULONG ix[5],
				  double x[5], 
				  LONG e0b,
				  LONG dtexp);


/*
 * Convert one coordinate's predictor data x[0..4] to integer form ix[0..4]
 * using the common exponent e0b and the timestep exponent dtexp.
 *
 * ix[0] is x[0]*xscale converted to integer (truncation via LONG when X86
 * is defined, rint() otherwise).  When e0b+xunit > 0, each x[i], i=1..4,
 * has its exponent field replaced by PRED_V_LEN+ek0-e0b-dtexp*i (or the
 * whole word set to 0 when the exponent field is 0), is rounded by adding
 * 0.5, and gets its sign bit placed at bit lentab[i]; e0b+xunit is then
 * OR-ed into ix[1] above bit lentab[1].  Otherwise ix[1..4] are set to 0.
 * Always returns 0.
 *
 * The double <-> ULONG reinterpretations use memcpy instead of pointer
 * casts, which violated the strict-aliasing rule.
 */
ULONG convert_predictor_using_e0b2(ULONG ix[5], /* converted position, vel... */
				  double x[5], /* position, vel ... */
				  LONG e0b,
				  LONG dtexp)

{
    int  ek0,  expv;
    int i;
    int msbloc, sign;
    double scaled_pk;
    ULONG l;
    ULONG l1;
#ifdef X86
    LONG ixsigned;
#endif
#ifdef DEBUG
    ULONG ix2;
#endif
    /* convert x and set exponent of v */
#ifdef X86
    ixsigned = x[0]*xscale;
    ix[0] = (ULONG) ixsigned;
#else
    ix[0] = rint(x[0]*xscale);
#endif
#ifdef INTERNAL_OUT
#ifdef X86
    fprintf(stderr, "ia32 x[0], xscale, result = %g %g %Lx\n", x[0], xscale, ix[0]);
#else
    fprintf(stderr, "alphax[0], xscale, result = %g %g %lx\n", x[0], xscale, ix[0]);
#endif
#endif
    expv = e0b+xunit;
    if(expv > 0){
	for(i=1;i<5;i++){
	    memcpy(&l, &x[i], sizeof l);
	    ek0 = (l>>52)&0x7ff;
	    sign = l>>63;
	    msbloc = PRED_V_LEN +ek0-e0b-dtexp*i;
	    if (ek0 != 0) {
		/* keep the fraction bits, substitute the new exponent */
		l1 =  (((ULONG) msbloc)<<52) | (0xfffffffffffffL & l);
	    }else{
		l1 = 0;
	    }
	    memcpy(&scaled_pk, &l1, sizeof scaled_pk);
#ifdef DEBUG
	    ix2 = (ULONG)(scaled_pk+0.5)| (sign <<lentab[i]);
#endif
	    ix[i] = (unsigned int)(scaled_pk+0.5)| (sign <<lentab[i]);
#ifdef DEBUG
	    if (ix[i] != ix2){
		fprintf(stderr,"(cnvert ... ix, ix2 %Lx %Lx\n",
			ix[i], ix2);
	    }
#endif
	}
	ix[1] |= ((ULONG)expv) << (lentab[1]+1);
    }else{
	ix[1] = ix[2] = ix[3]=ix[4] = 0;
    }
    return 0;
}

/* This is for x86 (high word at the larger address, 32-bit int machine).
 * This version does not work with gcc 2.96, 3.x with -O2 or higher...
 *
 * Same conversion as convert_predictor_using_e0b2(), but the exponent and
 * sign are taken from the high 32-bit word of each double, accessed
 * through unsigned int pointers.  Always returns 0.
 *
 * NOTE(review): two of the "final ix[1]" fprintf calls below are not
 * guarded by INTERNAL_OUT and therefore always print; they look like
 * leftover debugging output -- confirm before reusing this function.
 */ 
ULONG convert_predictor_using_e0b2x86_old(ULONG ix[5], /* converted position, vel... */
				  double x[5], /* position, vel ... */
				  LONG e0b,
				  LONG dtexp)

{
    int  ek0,  expv;
    int i,k;
    int msbloc, sign;
    double scaled_pk;
    ULONG l;
    unsigned int li;
    unsigned int *lip, *l1p;
    LONG ixsigned;
    ULONG l1;
    ULONG  *lp, ix2;
    lp = (ULONG*) x;
    /* lip[2*i] is the high 32-bit word of x[i]; l1p is the high word of l1 */
    lip = ((unsigned int *) x)+1;
    l1p=  ((unsigned int *) (&l1))+1;
    /* convert x and set exponent of v */
#ifdef X86    
    ixsigned = x[0]*xscale;
    ix[0] = (ULONG) ixsigned;
#else
    ix[0] = rint(x[0]*xscale);
#endif 
    expv = e0b+xunit;
#ifdef INTERNAL_OUT
#ifdef X86
    fprintf(stderr, "ia32 x[0], xscale, result = %g %g %Lx %x\n", x[0], xscale, ix[0],expv);
#else
    fprintf(stderr, "alphax[0], xscale, result = %g %g %lx\n", x[0], xscale, ix[0]);
#endif
#endif    
    if(expv > 0){
	for(i=1;i<5;i++){
	    li = lip[i+i];
	    l1 = lp[i];
	    /* exponent field and sign bit, read from the high word */
	    ek0 = (li>>20)&0x7ff;
	    sign = li>>31;
	    msbloc = PRED_V_LEN +ek0-e0b-dtexp*i;
	    if (ek0 != 0) {
		/* replace the exponent in the high word of l1, keep fraction bits */
		*l1p =  (((ULONG) msbloc)<<20) | (0xfffffL & (*l1p));
	    }else{
		*l1p = 0;
	    }
	    //scaled_pk = *((double*)(&l1));
	    *((ULONG*)(&scaled_pk)) = l1;
	    //   ix[i] = (unsigned int)(scaled_pk+0.5)| (sign <<lentab[i]);
	   ix[i] = (ULONG)((*((double*)(&l1)))+0.5)| (sign <<lentab[i]);
#ifdef INTERNAL_OUT
    fprintf(stderr, "ix[%d] = %Lx\n",i, ix[i]);
#endif
	}
    /* unconditional print (not guarded by INTERNAL_OUT) */
    fprintf(stderr, "final ix[1] = %Lx\n", ix[1]);
	ix[1] |= (ULONG)(((ULONG)expv) << (lentab[1]+1));
    }else{
	ix[1] = ix[2] = ix[3]=ix[4] = 0;
    }
#ifdef INTERNAL_OUT
    fprintf(stderr, "final ix[1] = %Lx\n", ix[1]);
#endif
    /* unconditional print (not guarded by INTERNAL_OUT) */
    fprintf(stderr, "final ix[1] = %Lx\n", ix[1]);
    return 0;
}

/* This is for x86 (high word at the larger address, 32-bit int machine).
 *
 * Variant of convert_predictor_using_e0b2() that accesses the two 32-bit
 * halves of each double through the DATAPACK union; i32[1] is used as the
 * word that holds sign and exponent.  Results are written to ix[0..4] as
 * 64-bit values.  Always returns 0.
 */ 
ULONG convert_predictor_using_e0b2x86_old2(ULONG ix[5], /* converted position, vel... */
				  double x[5], /* position, vel ... */
				  LONG e0b,
				  LONG dtexp)

{
    DATAPACK *xp;
    int  ek0,  expv;
    int i,k;
    int msbloc, sign;
    double scaled_pk;
    ULONG l;
    unsigned int li;
    LONG ixsigned;
    DATAPACK l1;
    /* convert x and set exponent of v */
#ifdef X86    
    ixsigned = x[0]*xscale;
    ix[0] = (ULONG) ixsigned;
#else
    ix[0] = rint(x[0]*xscale);
#endif 
    expv = e0b+xunit;
#ifdef INTERNAL_OUT
#ifdef X86
    fprintf(stderr, "ia32 x[0], xscale, result = %g %g %Lx %x\n", x[0], xscale, ix[0],expv);
#else
    fprintf(stderr, "alphax[0], xscale, result = %g %g %lx\n", x[0], xscale, ix[0]);
#endif
#endif
    xp = (DATAPACK*) x ;
    if(expv > 0){
	for(i=1;i<5;i++){
	    li = xp[i].i32[1];
	    l1.i64 = xp[i].i64;
	    /* exponent field and sign bit, read from the high word */
	    ek0 = (li>>20)&0x7ff;
	    sign = li>>31;
	    msbloc = PRED_V_LEN +ek0-e0b-dtexp*i;
	    if (ek0 != 0) {
		/* replace the exponent in the high word, keep fraction bits */
		l1.i32[1] =  (((ULONG) msbloc)<<20) | (0xfffffL & l1.i32[1]);
	    }else{
		l1.i32[1] = 0;
	    }
	    /* round by adding 0.5 and place the sign at bit lentab[i] */
	    ix[i] = ((ULONG)(l1.d+0.5))| (((ULONG)sign) <<lentab[i]);
#ifdef INTERNAL_OUT
    fprintf(stderr, "ix[%d] = %Lx\n",i, ix[i]);
#endif
	}
	/* the common exponent goes above bit lentab[1] of the first term */
	ix[1] |= (ULONG)(((ULONG)expv) << (lentab[1]+1));
    }else{
	ix[1] = ix[2] = ix[3]=ix[4] = 0;
    }
#ifdef INTERNAL_OUT
    fprintf(stderr, "final ix[1] = %Lx\n", ix[1]);
#endif
    return 0;
}


/* This is for x86 (high word at the larger address, 32-bit int machine).
 *
 * Convert one coordinate's predictor data.
 *   *ix        receives x[0]*xscale truncated through a LONG.
 *   ix32[0..3] receive the converted x[1..4] as 32-bit words.
 *
 * When e0b+xunit > 0, each x[i], i=1..4, has the exponent in its high
 * word replaced by PRED_V_LEN+ek0-e0b-dtexp*i (or the high word cleared
 * when the exponent field is 0), is rounded by adding 0.5 and gets its
 * sign bit placed at bit lentab[i]; e0b+xunit is then OR-ed into ix32[0]
 * above bit lentab[1].  Otherwise ix32[0..3] are set to 0.
 * Always returns 0.
 *
 * ix points at a single ULONG, so the final debug print reports ix32[0]
 * (the word that was just completed) instead of reading ix[1], which is
 * outside the object passed by the callers in this file.
 */
int convert_predictor_using_e0b2x86i32(ULONG *ix, /* converted position*/
				       unsigned int ix32[4],
				       double x[5], /* position, vel ... */
				       int e0b,
				       int dtexp)

{
    DATAPACK *xp;
    int  ek0,  expv;
    int i;
    int msbloc, sign;
    unsigned int li;
    LONG ixsigned;
    DATAPACK l1;
    /* convert x and set exponent of v */

    ixsigned = x[0]*xscale;
    *ix = (ULONG) ixsigned;

    expv = e0b+xunit;
#ifdef INTERNAL_OUT
    fprintf(stderr, "ia32 x[0], xscale, result = %g %g %Lx %x\n", x[0], xscale, *ix,expv);
#endif
    xp = (DATAPACK*) x ;
    if(expv > 0){
	for(i=1;i<5;i++){
	    li = xp[i].i32[1];
	    l1.i64 = xp[i].i64;
	    /* exponent field and sign bit, read from the high word */
	    ek0 = (li>>20)&0x7ff;
	    sign = li>>31;
	    msbloc = PRED_V_LEN +ek0-e0b-dtexp*i;
	    if (ek0 != 0) {
		/* replace the exponent in the high word, keep fraction bits */
		l1.i32[1] =  (((unsigned int) msbloc)<<20) | (0xfffff & l1.i32[1]);
	    }else{
		l1.i32[1] = 0;
	    }
	    ix32[i-1] = ((unsigned int)(l1.d+0.5))| (((unsigned int)sign) <<lentab[i]);
#ifdef INTERNAL_OUT
    fprintf(stderr, "ix32[%d] = %x\n",i-1, ix32[i-1]);
#endif
	}
	ix32[0] |= (((unsigned int)expv) << (lentab[1]+1));
    }else{
	ix32[0] = ix32[1] = ix32[2]=ix32[3] = 0;
    }
#ifdef INTERNAL_OUT
    fprintf(stderr, "final ix32[0] = %x\n", ix32[0]);
#endif
    return 0;
}


/* This is for x86 (high word at the larger address, 32-bit int machine).
 *
 * Adapter with the same signature as convert_predictor_using_e0b2():
 * runs convert_predictor_using_e0b2x86i32() and widens its four 32-bit
 * results into ix[1..4].  ix[0] is written by the callee.  Returns the
 * callee's return value.
 */
ULONG convert_predictor_using_e0b2x86(ULONG ix[5], /* converted position, vel... */
				  double x[5], /* position, vel ... */
				  LONG e0b,
				  LONG dtexp)

{
    unsigned int packed[4];
    ULONG status;
    int j;

    status = convert_predictor_using_e0b2x86i32(ix, packed, x,
                                                (int) e0b, (int) dtexp);
    for (j = 0; j < 4; j++) {
        ix[j+1] = packed[j];
    }
    return status;
}
    


/*
 * Convert a full j-particle predictor (three coordinates, five terms
 * each) to integer form.
 *
 * The time fields are produced by convert_predictor_time2().  The largest
 * exponent returned by determine_predictor_exponent2() over the three
 * coordinates is then used as the common exponent for the per-coordinate
 * conversion (convert_predictor_using_e0b2, or the x86 variant when X86
 * is defined).  The return values of all helpers are OR-ed together and
 * returned.  The tunit argument is not read here.
 */
ULONG convert_predictor_vector2(ULONG *  tjlsb, /* LSB of tj */
			ULONG * dtjmsb, /* location of MSB of delta ti */
			ULONG ix[3][5], /* converted position, vel... */
			double rtj, /* particle time */
			double rdtj, /* particle timestep */
			double x[3], /* position, vel ... */
			double v[3], /* position, vel ... */
			double aby2[3], /* position, vel ... */
			double a1by6[3], /* position, vel ... */
			double a2by18[3], /* position, vel ... */
			ULONG xunit, /* position resolution= 2**-xunit*/
			ULONG tunit /* time resolution=2**-tinit*/)
{
  /* The terms v..a2 share one exponent, so they are all handled the same
     way apart from the power of dt.  Velocity keeps its original exponent
     (it is already normalized and dt does not exceed one); to avoid
     overflow its mantissa is downshifted so only 23 bits are used. */

  double *term[5];
  double xdata[3][5];
  LONG dtexp;
  LONG common_exp;
  LONG this_exp;
  ULONG err;
  int axis, j;

  term[0] = x;
  term[1] = v;
  term[2] = aby2;
  term[3] = a1by6;
  term[4] = a2by18;

  /* time values first */
  err = convert_predictor_time2(tjlsb,dtjmsb, &dtexp,  rtj, rdtj);

  /* start from a very negative exponent and keep the maximum */
  common_exp = 10000000000;
  common_exp = - common_exp;
  for (axis = 0; axis < 3; axis++) {
    for (j = 0; j < 5; j++) {
      xdata[axis][j] = term[j][axis];
    }
    err |= determine_predictor_exponent2(&this_exp, dtexp,  xdata[axis], xunit);
    if (this_exp > common_exp) {
      common_exp = this_exp;
    }
  }
#ifdef INTERNAL_OUT
      fprintf(stderr,"(convert) e0b = %x\n", (int)common_exp);
#endif

  for (axis = 0; axis < 3; axis++) {
#ifdef X86
      err |= convert_predictor_using_e0b2x86(ix[axis],  xdata[axis],  common_exp,  dtexp);
#else
      err |= convert_predictor_using_e0b2(ix[axis],  xdata[axis],  common_exp,  dtexp);
#endif
#ifdef INTERNAL_OUT
#ifdef X86
      fprintf(stderr,"(convert) %3d  %g %Lx\n", axis,  xdata[axis][0],ix[axis][0]);
#else
      fprintf(stderr,"(convert) %3d  %g %lx\n", axis,  xdata[axis][0],ix[axis][0]);
#endif
#endif
  }
  return err;
}


/*
 * 32-bit-output variant of convert_predictor_vector2().
 *
 * ix[k] receives the converted position of coordinate k and ix32[k][0..3]
 * the four converted higher-order terms.  The time fields are produced by
 * convert_predictor_time2(); the largest exponent returned by
 * determine_predictor_exponent2() over the three coordinates is used as
 * the common exponent for convert_predictor_using_e0b2x86i32().  The
 * return values of all helpers are OR-ed together and returned.  The
 * tunit argument is not read here.
 *
 * The INTERNAL_OUT trace prints ix[k]; ix is a one-dimensional array
 * here, so the former ix[k][0] did not compile with INTERNAL_OUT defined.
 */
unsigned int
convert_predictor_vector2i32(ULONG *  tjlsb, /* LSB of tj */
			     ULONG * dtjmsb, /* location of MSB of delta ti */
			     ULONG ix[3], /* converted position */
			     unsigned int ix32[3][4], /*converted vel ...*/
			     double rtj, /* particle time */
			     double rdtj, /* particle timestep */
			     double x[3], /* position, vel ... */
			     double v[3], /* position, vel ... */
			     double aby2[3], /* position, vel ... */
			     double a1by6[3], /* position, vel ... */
			     double a2by18[3], /* position, vel ... */
			     ULONG xunit, /* position resolution= 2**-xunit*/
			     ULONG tunit /* time resolution=2**-tinit*/)
{

    /* since we use common exponent for a2-v, they should be
       calculated in the same way, except for the difference in the
       power of dt.

       For velocity, we have the exponent same as the original velocity,
       since velocity is already normalized and dt does not exceed one.
       In order to prevent possible overflow, mantissa of v should be
       downshifted (to use only 23 bits).
    */

    /* first, we calculate time values */
    LONG dtexp,e0b;
    int dtexp32, e0b32;
    double xdata[3][5];
    LONG e0btmp;
    int k;
    unsigned int err;
    err = convert_predictor_time2(tjlsb,dtjmsb, &dtexp,  rtj, rdtj);
    /* start from a very negative exponent and keep the maximum */
    e0b = 10000000000;
    e0b = - e0b;
    for(k=0;k<3;k++){
	xdata[k][0] = x[k];
	xdata[k][1] = v[k];
	xdata[k][2] = aby2[k];
	xdata[k][3] = a1by6[k];
	xdata[k][4] = a2by18[k];
	err |= determine_predictor_exponent2(&e0btmp, dtexp,  xdata[k], xunit);
	if (e0btmp > e0b)e0b = e0btmp;
    }
#ifdef INTERNAL_OUT
    fprintf(stderr,"(convert) e0b = %x\n", (int)e0b);
#endif
    /* the i32 converter takes plain int exponents */
    dtexp32 = dtexp;
    e0b32 = e0b;
    for(k=0;k<3;k++){
	err |= convert_predictor_using_e0b2x86i32(ix+k,ix32[k],  xdata[k],  e0b32,  dtexp32);
#ifdef INTERNAL_OUT
	fprintf(stderr,"(convert) %3d  %g %llx\n", k,  xdata[k][0],
		(unsigned long long) ix[k]);
#endif
    }
    return err;
}

/*
 * Convert the three position components x[0..2] to integer form in
 * ix[0..2] by scaling with xscale.  With X86 defined the product is
 * converted through a signed LONG; otherwise it is rounded with rint().
 * Always returns 0.
 */
ULONG convert_jp_position(ULONG ix[3], /* converted position */
			double x[3]) /* position, vel ... */
{
    int axis = 0;

    while (axis < 3) {
#ifdef X86
        LONG as_signed = x[axis]*xscale;

        ix[axis] = (ULONG) as_signed;
#else
        ix[axis] = rint(x[axis]*xscale);
#endif
#ifdef INTERNAL_OUT
#ifdef X86
        fprintf(stderr, "ia32 x[k], xscale, result = %g %g %Lx\n", x[axis], xscale, ix[axis]);
#else
        fprintf(stderr, "alphax[k], xscale, result = %g %g %lx\n", x[axis], xscale, ix[axis]);
#endif
#endif
        axis++;
    }
    return 0;
}

#define JPBUFUNIT (18+cp->jpspace)

/*
 * Allocate (or grow) the j-particle buffer of a cluster and switch the
 * cluster to buffered mode.
 *
 * A jp_buffer_size of -1 means no buffer exists yet.  The buffer holds
 * JPBUFUNIT ints per particle for size+100 particles.  An existing buffer
 * is only reallocated when it is smaller than the requested size.
 *
 * Returns the resulting buffer size, or -1 when allocation fails.  On a
 * failed reallocation the previous buffer and its recorded size are left
 * untouched (the old pointer is no longer overwritten with NULL), and
 * buffered mode / the current fill level are not modified.
 */
int g6_initialize_jp_buffer(int clusterid, int size)
{
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];   /* JPBUFUNIT reads cp->jpspace */

    if (cp->jp_buffer_size == -1){
	fprintf(stderr,"allocate jp buffer of size %d\n",size);
	cp->jp_buffer_p = malloc(sizeof(int)*JPBUFUNIT*(size+100));
	if (cp->jp_buffer_p == NULL){
	    return -1;
	}
	cp->jp_buffer_size = size;
    }else if (cp->jp_buffer_size <  size){
	void *grown;

	fprintf(stderr,"reallocate jp buffer of size %d\n",size);
	grown = realloc(cp->jp_buffer_p, sizeof(int)*JPBUFUNIT*(size+100));
	if (grown == NULL){
	    /* realloc leaves the original block valid on failure */
	    return -1;
	}
	cp->jp_buffer_p = grown;
	cp->jp_buffer_size = size;
    }
    cp->jp_buffered_mode = 1;
    cp->jp_buffer_current = 0;
    return cp->jp_buffer_size;
}
	
/* By-reference entry point: dereferences both arguments and forwards them
   to g6_initialize_jp_buffer(), returning its result. */
int g6_initialize_jp_buffer_(int* clusterid, int* size)
{
    const int id = *clusterid;
    const int nparticles = *size;

    return g6_initialize_jp_buffer(id, nparticles);
}


/* Return the jp_buffer_current field of the given cluster. */
int g6_read_jp_buffer_np(int clusterid)
{
    return clusters[clusterid]->jp_buffer_current;
}
/* Store val in the jp_buffer_current field of the given cluster and
   return val. */
int g6_set_jp_buffer_np(int clusterid, int val)
{
    clusters[clusterid]->jp_buffer_current = val;
    return val;
}
unsigned int * g6_get_jp_buffer_ptr(int clusterid)
{
    return clusters[clusterid]->jp_buffer_p;
}
int g6_get_jpspace(int clusterid)
{
    return clusters[clusterid]->jpspace;
}
int g6_get_nchips(int clusterid)
{
    return clusters[clusterid]->nchipsperboard;
}
    

/*
 * Send every particle currently held in the cluster's jp buffer to the
 * board and empty the buffer.
 *
 * Returns 0 immediately when the cluster is not in buffered mode or the
 * buffer is empty.  Otherwise hib_sending_jp[boardid] is set for the
 * duration of the transfer, the data is written either with linkwrite()
 * (cp->ijpdma == 0) or with linkwrite_dma(), the buffer fill level is
 * reset to 0 and g6reset_pbonly() is called.  Always returns 0.
 *
 * Particles are sent in groups of nblock (NBLOCKMAX, or 1 when
 * cp->jpspace is 0); each particle occupies JPBUFUNIT words.
 */
int g6_flush_jp_buffer(int  clusterid)
{
    int  i,boardid;
    GRAPE6_CLUSTER_PTR cp;
    int loc = 0;	/* number of particles already sent in full groups */
    int len;
    int plen;		/* words per particle */
    int nblock;		/* particles per write */
    cp = clusters[clusterid];
    plen = JPBUFUNIT;
    if ((cp->jp_buffered_mode == 0) || (cp->jp_buffer_current == 0)) return 0;
    
    boardid = cp->ipboardid;
    hib_sending_jp[boardid] = 1;
    nblock = NBLOCKMAX;
    if (cp->jpspace == 0) nblock = 1;
    if(cp->ijpdma == 0){
	/* non-DMA path */
#ifdef X86	
	PCIdummyaccess(boardid);
	dummyloop(50); /* dummy wait needed at least on X86 */
#endif
	/* full groups of nblock particles; the loop bound leaves at least
	   one particle for the single-particle loop below */
	for(i=0;i<cp->jp_buffer_current-nblock; i+=nblock){
	    linkwrite(boardid,hib_jplinkid[boardid],plen*nblock,
		      cp->jp_buffer_p+i*plen);
	    loc += nblock;
	    if (nblock == 1){
#ifdef X86
		PCIdummyaccess(boardid);
		dummyloop(200);
#else
		MB;
#endif		
	    }

	}
	/* remaining particles, one write per particle */
	for(i=loc;i<cp->jp_buffer_current; i++){
	    linkwrite(boardid,hib_jplinkid[boardid],plen,cp->jp_buffer_p+i*plen);
	    if (nblock == 1){
#ifdef X86
		PCIdummyaccess(boardid);
		dummyloop(200);
#else
		MB;
#endif		
	    }
	}
    }else{
	/* DMA path: wait for each transfer before starting the next */
	/*	sethibdmamode(boardid);*/
	wait_dma_to_finish(boardid);
	for(i=0;i<cp->jp_buffer_current-nblock; i+=nblock){
	    linkwrite_dma(clusterid,hib_jplinkid[boardid],plen*nblock,
			  cp->jp_buffer_p+i*plen);
	    loc += nblock;
	    nwait(plen*nblock*(1000/JPWAITFACTOR));
	    wait_dma_to_finish(boardid);
	}
	/* remaining particles in a single transfer; the wait below is
	   computed from nblock, not from len */
	len =  cp->jp_buffer_current-loc;
	if (len > 0){
	    linkwrite_dma(clusterid,hib_jplinkid[boardid],plen*len,
			  cp->jp_buffer_p+loc*plen);
	    nwait(plen*nblock*(1000/JPWAITFACTOR));
	}
	wait_dma_to_finish(boardid);
    }
    /*    sethibdmamode(boardid);*/
    cp->jp_buffer_current = 0;
    hib_sending_jp[boardid] = 0;
    uwait(3);
    g6reset_pbonly(boardid);
    return 0;
}

/* By-reference entry point: dereferences the cluster id and forwards it to
   g6_flush_jp_buffer(), returning its result. */
int g6_flush_jp_buffer_(int*  clusterid)
{
    const int id = *clusterid;

    return g6_flush_jp_buffer(id);
}

/* File-local flag; when non-zero the multisend flush routines print the
   jp buffer contents to stderr before sending.  Initially 0. */
static int jpbuf_dump_mode = 0;

/* Store mode in jpbuf_dump_mode. */
void g6_set_jpbuf_dump_mode(int mode)
{
    int *flag = &jpbuf_dump_mode;

    *flag = mode;
}


/*
 * Send the jp buffer of cluster `clusterid` to `nclusters` consecutive
 * clusters (clusterid .. clusterid+nclusters-1) and empty the buffer.
 *
 * Only the buffer, jpspace and ijpdma of the first cluster are consulted;
 * the same data is written to the board of every cluster.  Exits the
 * process with status 1 when nclusters exceeds MAXNCLUSTERS.  When
 * jpbuf_dump_mode is non-zero the first 18 words of every buffered
 * particle are printed to stderr first.  Always returns 0.
 */
int g6_flush_jp_buffer_and_multisend(int  clusterid, int nclusters)
{
#define MAXNCLUSTERS 5    
    int  i;
    GRAPE6_CLUSTER_PTR cp;
    int loc = 0;	/* incremented below but never read */
    int len;		/* not used in this function */
    int plen;		/* words per particle */
    int nblock;		/* particles per write */
    int ic;
    int boardid[MAXNCLUSTERS];
    if (nclusters >  MAXNCLUSTERS){
	fprintf(stderr,"g6_flush_jp_buffer_and_multisend: too large nclusters = %d\n",
		nclusters);
	exit(1);
    }
    cp = clusters[clusterid];
    plen = JPBUFUNIT;
    /* look up every target board and flag it as receiving j-particles */
    for (i=0;i<nclusters;i++){
	boardid[i] = clusters[clusterid+i]->ipboardid;
	hib_sending_jp[boardid[i]] = 1;
	PCIdummyaccess(boardid[i]);
    }
    if(jpbuf_dump_mode){
	int j;
	fprintf(stderr,"Dumping JP BUFFER\n");
	for(i=0;i<cp->jp_buffer_current;i++){
	    fprintf(stderr,"Particle %d\n",i);
	    for(j=0;j<18;j++){
		fprintf(stderr," %8x",*(cp->jp_buffer_p+i*plen+j));
		if (j  == 8)fprintf(stderr,"\n");
	    }
	    fprintf(stderr,"\n");
	}
    }
    nblock = NBLOCKMAX;
    if (cp->jpspace == 0) nblock = 1;
    dummyloop(50); /* dummy wait needed at least on X86 */
    
    
    if (cp->ijpdma){
	for(ic = 0;ic < nclusters;ic++)wait_dma_to_finish(boardid[ic]);
    }
    /* send in groups of nblock particles; the last group may be shorter */
    for(i=0;i<cp->jp_buffer_current; i+=nblock){
	int nsend = nblock;
	if (nsend > cp->jp_buffer_current - i) nsend = cp->jp_buffer_current - i;
	loc +=nblock;
	/* the same group goes to every board before moving on */
	for(ic = 0;ic < nclusters;ic++){
	    if (cp->ijpdma){
		linkwrite_dma(clusterid+ic,hib_jplinkid[boardid[ic]],plen*nsend,cp->jp_buffer_p+i*plen);
	    }else{
		linkwrite(boardid[ic],hib_jplinkid[boardid[ic]],plen*nsend,cp->jp_buffer_p+i*plen);
		if (nblock == 1) PCIdummyaccess(boardid[ic]);
	    }		
	}
	if (cp->ijpdma){
	    /* the wait is computed from nblock, not from nsend */
	    nwait(plen*nblock*(1000/JPWAITFACTOR));
	    for(ic = 0;ic < nclusters;ic++)wait_dma_to_finish(boardid[ic]);
	}
	
#ifdef X86
	if ((nclusters == 1) && (nblock == 1))dummyloop(200); /* dummy wait needed at least on X86 */
#else
	MB;
#endif	
    }
    cp->jp_buffer_current = 0;
    
    /* clear the sending flags and reset each board */
    for (i=0;i<nclusters;i++){
	hib_sending_jp[boardid[i]] = 0;
	g6reset_pbonly(boardid[i]);
    }
    return 0;
}

/*
 * Incremental version of g6_flush_jp_buffer_and_multisend(): each call
 * sends groups of buffered particles starting at the position reached by
 * the previous call, stopping once about nnsend particles have been
 * handled (whole groups of nblock are always sent) or the buffer is
 * exhausted.
 *
 * Progress is kept in function-local static variables, so only one flush
 * can be in progress at a time and the function is not reentrant.
 *
 * Returns 1 when the whole buffer has been sent (the buffer is then
 * emptied, the sending flags cleared and the boards reset), 0 when more
 * calls are needed.  Exits the process with status 1 when nclusters
 * exceeds MAXNCLUSTERS.
 *
 * cp, plen and boardid[] are now set up on every call.  They used to be
 * assigned only on the first call of a flush, so every continuation call
 * dereferenced an uninitialized pointer and used uninitialized board ids.
 */
int g6_flush_jp_buffer_and_multisend_async(int  clusterid, int nclusters,
					   int nnsend)
{
#define MAXNCLUSTERS 5
    int  i;
    GRAPE6_CLUSTER_PTR cp;
    int plen;		/* words per particle */
    int nblock;		/* particles per write */
    int ic;
    int boardid[MAXNCLUSTERS];
    static int finished = 1;	/* 1 = no flush in progress */
    static int current_loc = 0;	/* next particle to send */
    int iend;

    if (nclusters >  MAXNCLUSTERS){
	fprintf(stderr,"g6_flush_jp_buffer_and_multisend: too large nclusters = %d\n",
		nclusters);
	exit(1);
    }

    /* per-call state: needed by continuation calls as well */
    cp = clusters[clusterid];
    plen = JPBUFUNIT;
    for (i=0;i<nclusters;i++){
	boardid[i] = clusters[clusterid+i]->ipboardid;
    }

    if (finished){
	/* first call of a new flush */
	finished = 0;
	current_loc = 0;
	for (i=0;i<nclusters;i++){
	    hib_sending_jp[boardid[i]] = 1;
	    PCIdummyaccess(boardid[i]);
	}
	if(jpbuf_dump_mode){
	    int j;
	    fprintf(stderr,"Dumping JP BUFFER\n");
	    for(i=0;i<cp->jp_buffer_current;i++){
		fprintf(stderr,"Particle %d\n",i);
		for(j=0;j<18;j++){
		    fprintf(stderr," %8x",*(cp->jp_buffer_p+i*plen+j));
		    if (j  == 8)fprintf(stderr,"\n");
		}
		fprintf(stderr,"\n");
	    }
	}
	dummyloop(50); /* dummy wait needed at least on X86 */
    }

    nblock = NBLOCKMAX;
    if (cp->jpspace == 0) nblock = 1;

    if (cp->ijpdma){
	for(ic = 0;ic < nclusters;ic++)wait_dma_to_finish(boardid[ic]);
    }
    iend = cp->jp_buffer_current;
    if (iend > current_loc + nnsend) iend = current_loc + nnsend;

    for(i=current_loc;i<iend; i+=nblock){
	int nsend = nblock;
	if (nsend > cp->jp_buffer_current - i) nsend = cp->jp_buffer_current - i;
	/* the same group goes to every board before moving on */
	for(ic = 0;ic < nclusters;ic++){
	    if (cp->ijpdma){
		linkwrite_dma(clusterid+ic,hib_jplinkid[boardid[ic]],plen*nsend,cp->jp_buffer_p+i*plen);
	    }else{
		linkwrite(boardid[ic],hib_jplinkid[boardid[ic]],plen*nsend,cp->jp_buffer_p+i*plen);
		if (nblock == 1) PCIdummyaccess(boardid[ic]);
	    }
	}
	if (cp->ijpdma){
	    nwait(plen*nblock*(1000/JPWAITFACTOR));
	    for(ic = 0;ic < nclusters;ic++)wait_dma_to_finish(boardid[ic]);
	}
	current_loc += nsend;
#ifdef X86
	if ((nclusters == 1) && (nblock == 1))dummyloop(200); /* dummy wait needed at least on X86 */
#else
	MB;
#endif
    }
    if (current_loc == cp->jp_buffer_current){
	/* everything sent: empty the buffer and release the boards */
	cp->jp_buffer_current = 0;
	finished = 1;
	for (i=0;i<nclusters;i++){
	    hib_sending_jp[boardid[i]] = 0;
	    g6reset_pbonly(boardid[i]);
	}
    }
    return finished;
}

/*
 * Set a j-particle that carries only position, mass and index (all
 * predictor words are written as 0), for *nclusters consecutive clusters
 * starting at *clusterid.
 *
 * The packet is JPBUFUNIT words long: chip select, memory address, three
 * 64-bit positions as low/high word pairs, eight zero words, the mass as
 * a single-precision bit pattern, and the index.  In buffered mode the
 * packet is appended to the first cluster's jp buffer; otherwise it is
 * written immediately to every cluster's board with linkwrite().
 * Always returns 0.
 *
 * The float bit pattern is copied with memcpy instead of being read
 * through an unsigned int pointer cast (strict-aliasing violation).
 *
 * NOTE(review): in unbuffered mode JPBUFUNIT words are sent from the local
 * jpdata[26] array although only words 0..17 are filled here -- confirm
 * that jpspace never exceeds 8 and that the padding content is irrelevant.
 */
int g6_set_j_particle_multisend_mxfast_(int * clusterid,
				 int * nclusters,
				 int *address,
				 int *index,
				 double *mass,
				 double x[3] /* position */)
{
    unsigned int jpdata[26],nword,ichip,mem_adr;
    unsigned int mass_bits;
    float fmass;
    ULONG ix[3];
    int nchips, i,k, boardid;
    unsigned int * jpp;
    GRAPE6_CLUSTER_PTR cp;
    int ic;
    int retcode = 0;
#ifdef SIMULATOR
    struct jparticle * jp;
#endif
    cp = clusters[*clusterid];
    nchips = clusters[*clusterid]->nchipsperboard;
    convert_jp_position(ix,x);

    fmass = *mass;
    memcpy(&mass_bits, &fmass, sizeof mass_bits);

    /* addresses are interleaved over the chips of a board */
    ichip = (*address)%nchips ;
    mem_adr = (*address)/nchips ;
#ifdef INTERNAL_OUT
    fprintf(stderr,"address %d ichip %d mem %d\n",(*address),ichip,mem_adr);
#endif

    nword = JPBUFUNIT;

    if (cp->jp_buffered_mode){
	jpp = cp->jp_buffer_p+((cp->jp_buffer_current)*JPBUFUNIT);
    }else{
	jpp = jpdata;
    }
    jpp[0] = 0xffc00 | ichip;
    jpp[1] = mem_adr<<3;
    /* positions: low word, then high word */
    for(k=0;k<3;k++){
	jpp[2+2*k] = ix[k];
	jpp[3+2*k] = (ix[k]>>32);
    }
    /* no predictor data in this variant */
    for(i=8;i<16;i++){
	jpp[i] = 0;
    }
    jpp[16] = mass_bits;
    jpp[17] = *index;
    if (cp->jp_buffered_mode ){
	cp->jp_buffer_current++;
    }else{
	for(ic=0;ic< *nclusters;ic++){
	    boardid = clusters[*clusterid + ic]->ipboardid;
	    hib_sending_jp[boardid] = 1;
#ifdef INTERNAL_OUT
	    fprintf(stderr,"set sending_jp = 1 for boardid = %d\n", boardid);
	    for(i=0;i<18;i++){
		fprintf(stderr,"%3d %8x ", i, jpdata[i]);
		if ((i%6)==5)fprintf(stderr,"\n");
	    }
#endif
	    linkwrite(boardid,hib_jplinkid[boardid],nword,jpdata);
	    PCIdummyaccess(boardid);
	    hib_sending_jp[boardid] = 0;
	}
    }
#ifdef JPIPTEST
    for(i=0;i<18;i++){
	fprintf(stderr,"%3d %8x ", i, jpp[i]);
	if ((i%6)==5)fprintf(stderr,"\n");
    }
#endif
#ifdef SIMULATOR
    if (cp->simg6p->use_simulator){

	jp = cp->simg6p->jmem+ *address;
	jp->tjlsb = 0;
	jp->dtjmsb = 0;
	jp->mass = convert_double_to_grape_float(*mass, INTERACTION_F_LEN_U);
	jp->index = *index;
	for(k=0;k<3;k++){
	    jp->ix[k][0] = ix[k];
	    for(i=1;i<5;i++)jp->ix[k][i] = 0;
	}
    }
#endif
  return retcode;
}


#if PREFER32BIT
/*
 * Set a full j-particle (position plus predictor data), 32-bit-int
 * variant selected by PREFER32BIT, for *nclusters consecutive clusters
 * starting at *clusterid.
 *
 * The inputs are converted by convert_predictor_vector2i32() and packed
 * into an 18-word packet: chip select, memory address, three 64-bit
 * positions as low/high word pairs, eight words of bit-packed predictor
 * terms and time fields, the mass as a single-precision bit pattern, and
 * the index.  In buffered mode the packet is appended to the first
 * cluster's jp buffer; otherwise JPBUFUNIT words are written immediately
 * to every cluster's board with linkwrite().
 *
 * Returns the value returned by convert_predictor_vector2i32().
 *
 * The INTERNAL_OUT trace indexes ix[] and ix32[][] according to their
 * declarations in this variant (it previously used the 64-bit variant's
 * ix[k][j] layout, which does not compile here), and the float bit
 * pattern is copied with memcpy instead of a pointer cast.
 */
int g6_set_j_particle_multisend_(int * clusterid,
				 int * nclusters,
				 int *address,
				 int *index,
				 double *tj, /* particle time */
				 double *dtj, /* particle time */
				 double *mass,
				 double a2by18[3], /* a2dot divided by 18 */
				 double a1by6[3], /* a1dot divided by 6 */
				 double aby2[3], /* a divided by 2 */
				 double v[3], /* velocity */
				 double x[3] /* position */)
{
    unsigned int jpdata[20],nword,ichip,mem_adr;
    unsigned int mass_bits;
    float fmass;
    ULONG tjlsb,dtjmsb,ix[3];
    unsigned int ix32[3][4],retcode;
    int nchips, i,k, boardid;
    unsigned int * jpp;
    GRAPE6_CLUSTER_PTR cp;
    int ic;
#ifdef SIMULATOR
    struct jparticle * jp;
#endif
    cp = clusters[*clusterid];
    nchips = clusters[*clusterid]->nchipsperboard;
    retcode = convert_predictor_vector2i32(&tjlsb, &dtjmsb, ix,ix32,
				       *tj, *dtj, x, v, aby2, a1by6, a2by18,
				       xunit, tunit);
#ifdef INTERNAL_OUT
    fprintf(stderr,"g6_set_j_particle, %d %d %d %g %g %llx %llx\n", *clusterid, *address, *index,
	    *tj, *dtj, (unsigned long long) tjlsb, (unsigned long long) dtjmsb);
    fprintf(stderr,"x %g %g %g %llx %llx %llx\n",x[0],x[1],x[2],
	    (unsigned long long) ix[0], (unsigned long long) ix[1],
	    (unsigned long long) ix[2]);
    fprintf(stderr,"v %g %g %g %x %x %x\n",v[0],v[1],v[2],ix32[0][0],ix32[1][0],ix32[2][0]);
    fprintf(stderr,"a %g %g %g %x %x %x\n",aby2[0],aby2[1],aby2[2],       ix32[0][1],ix32[1][1],ix32[2][1]);
    fprintf(stderr,"j %g %g %g %x %x %x\n",a1by6[0],a1by6[1],a1by6[2],    ix32[0][2],ix32[1][2],ix32[2][2]);
    fprintf(stderr,"j2 %g %g %g %x %x %x\n",a2by18[0],a2by18[1],a2by18[2],ix32[0][3],ix32[1][3],ix32[2][3]);
#endif
    fmass = *mass;
    memcpy(&mass_bits, &fmass, sizeof mass_bits);
    nword = JPBUFUNIT;

    /* addresses are interleaved over the chips of a board */
    ichip = (*address)%nchips ;
    mem_adr = (*address)/nchips ;
#ifdef INTERNAL_OUT
    fprintf(stderr,"address %d ichip %d mem %d\n",(*address),ichip,mem_adr);
#endif
#ifdef JPIPTEST
#ifdef X86
    fprintf(stderr,"JP data for index %d: %Lx\n", *index, ix[0]);
#else
    fprintf(stderr,"JP data for index %d: %lx\n", *index, ix[0]);
#endif
#endif
    if (cp->jp_buffered_mode){
	jpp = cp->jp_buffer_p+((cp->jp_buffer_current)*JPBUFUNIT);
    }else{
	jpp = jpdata;
    }
    jpp[0] = 0xffc00 | ichip;
    jpp[1] = mem_adr<<3;
    /* positions: low word, then high word */
    jpp[2] = ix[0];
    jpp[3] = (ix[0]>>32);
    jpp[4] = ix[1];
    jpp[5] = (ix[1]>>32);
    jpp[6] = ix[2];
    jpp[7] = (ix[2]>>32);
    /* first predictor term of each coordinate takes a full word */
    jpp[8] =  ix32[0][0];
    jpp[9] =  ix32[1][0];
    jpp[10] = ix32[2][0];
    /* remaining terms and the time fields are bit-packed across words 11..15 */
    jpp[11] = (ix32[1][1]<<21) | ix32[0][1];
    jpp[12] = (ix32[0][2]<<31) | (ix32[2][1]<<10) | (ix32[1][1]>>11);
    jpp[13] = (ix32[1][2]<<16) | (ix32[0][2]>>1);
    jpp[14] = (ix32[1][3]<<29) | (ix32[0][3]<<18) | (ix32[2][2]<<1) | (ix32[1][2]>>16);
    jpp[15] = (tjlsb<<25) | (dtjmsb<<19) | (ix32[2][3]<<8) | (ix32[1][3]>>3);
    jpp[16] = mass_bits;
    jpp[17] = *index;
    if (cp->jp_buffered_mode ){
	cp->jp_buffer_current++;
    }else{
	for(ic=0;ic< *nclusters;ic++){
	    boardid = clusters[*clusterid + ic]->ipboardid;
	    hib_sending_jp[boardid] = 1;
#ifdef INTERNAL_OUT
	    fprintf(stderr,"set sending_jp = 1 for boardid = %d\n", boardid);
	    for(i=0;i<18;i++){
		fprintf(stderr,"%3d %8x ", i, jpdata[i]);
		if ((i%6)==5)fprintf(stderr,"\n");
	    }
#endif
	    linkwrite(boardid,hib_jplinkid[boardid],nword,jpdata);
	    PCIdummyaccess(boardid);
	    nwait(100);
	    hib_sending_jp[boardid] = 0;
	}
    }
#ifdef JPIPTEST
    for(i=0;i<18;i++){
	fprintf(stderr,"%3d %8x ", i, jpp[i]);
	if ((i%6)==5)fprintf(stderr,"\n");
    }
#endif
#ifdef SIMULATOR
    if (cp->simg6p->use_simulator){

	jp = cp->simg6p->jmem+ *address;
	jp->tjlsb = tjlsb;
	jp->dtjmsb = dtjmsb;
	jp->mass = convert_double_to_grape_float(*mass, INTERACTION_F_LEN_U);
	jp->index = *index;
	for(k=0;k<3;k++){
	    jp->ix[k][0]=ix[k];
	    for(i=1;i<5;i++)jp->ix[k][i] = ix32[k][i-1];
	}
    }
#endif

  return retcode;
}
#else
/* 64 bit int version */
/*
 * Build the 18-word j-particle packet for one memory address and either
 * queue it in the cluster's JP buffer (buffered mode) or write it at once
 * to *nclusters consecutive clusters starting at *clusterid.
 *
 * All arguments are passed by reference.  The predictor data are converted
 * by convert_predictor_vector2() into tjlsb, dtjmsb and ix[3][5]; judging
 * from the INTERNAL_OUT dump below, ix[k][0..4] hold the converted
 * x, v, a/2, a1/6 and a2/18 terms of component k.
 *
 * Returns the value returned by convert_predictor_vector2().
 */
int g6_set_j_particle_multisend_(int * clusterid,
				 int * nclusters,
				 int *address,
				 int *index,
				 double *tj, /* particle time */
				 double *dtj, /* particle time */
				 double *mass,
				 double a2by18[3], /* a2dot divided by 18 */
				 double a1by6[3], /* a1dot divided by 6 */
				 double aby2[3], /* a divided by 2 */
				 double v[3], /* velocity */
				 double x[3] /* position */)
{
    unsigned int jpdata[20],nword,ichip,mem_adr;
    float fmass;
    ULONG tjlsb,dtjmsb,ix[3][5],retcode;
    int nchips, i,k, boardid;
    unsigned int * jpp;
    GRAPE6_CLUSTER_PTR cp;
    int ic;
#ifdef SIMULATOR    
    struct jparticle * jp;
#endif
    cp = clusters[*clusterid];
    nchips = clusters[*clusterid]->nchipsperboard;
    retcode = convert_predictor_vector2(&tjlsb, &dtjmsb, ix,
				       *tj, *dtj, x, v, aby2, a1by6, a2by18,
				       xunit, tunit);
#ifdef INTERNAL_OUT
    /* NOTE(review): tjlsb, dtjmsb and ix[][1..4] are ULONG but are printed
       with %x here -- looks like a format/argument mismatch; confirm. */
    fprintf(stderr,"g6_set_j_particle, %d %d %d %g %g %x %x\n", *clusterid, *address, *index,
	    *tj, *dtj, tjlsb, dtjmsb);
    fprintf(stderr,"x %g %g %g %lx %lx %lx\n",x[0],x[1],x[2],ix[0][0],ix[1][0],ix[2][0]);
    fprintf(stderr,"v %g %g %g %x %x %x\n",v[0],v[1],v[2],ix[0][1],ix[1][1],ix[2][1]);
    fprintf(stderr,"a %g %g %g %x %x %x\n",aby2[0],aby2[1],aby2[2],       ix[0][2],ix[1][2],ix[2][2]);
    fprintf(stderr,"j %g %g %g %x %x %x\n",a1by6[0],a1by6[1],a1by6[2],    ix[0][3],ix[1][3],ix[2][3]);
    fprintf(stderr,"j2 %g %g %g %x %x %x\n",a2by18[0],a2by18[1],a2by18[2],ix[0][4],ix[1][4],ix[2][4]);
#endif    
    /* the mass is sent as a single-precision float */
    fmass = *mass;
    nword = JPBUFUNIT;
    
    /* consecutive addresses are spread round-robin over the chips of a board */
    ichip = (*address)%nchips ;
    mem_adr = (*address)/nchips ;
#ifdef INTERNAL_OUT
    fprintf(stderr,"address %d ichip %d mem %d\n",(*address),ichip,mem_adr);
#endif
#ifdef JPIPTEST
#ifdef X86    
    fprintf(stderr,"JP data for index %d: %Lx\n", *index, ix[0][0]);
#else    
    fprintf(stderr,"JP data for index %d: %lx\n", *index, ix[0][0]);
#endif
#endif
    /* choose the destination: next free slot of the JP buffer, or the
       local scratch packet that is sent immediately below */
    if (cp->jp_buffered_mode){
	jpp = cp->jp_buffer_p+((cp->jp_buffer_current)*JPBUFUNIT);
    }else{
	jpp = jpdata;
    }
    /* words 0-1: header (chip selector and memory address shifted by 3) */
    jpp[0] = 0xffc00 | ichip;
    jpp[1] = mem_adr<<3;
    /* words 2-7: ix[k][0] for k=0..2, low 32 bits first, then high 32 bits */
    jpp[2] = ix[0][0]; 
    jpp[3] = (ix[0][0]>>32); 
    jpp[4] = ix[1][0]; 
    jpp[5] = (ix[1][0]>>32); 
    jpp[6] = ix[2][0]; 
    jpp[7] = (ix[2][0]>>32); 
    /* words 8-10: ix[k][1], one word per component */
    jpp[8] = ix[0][1];
    jpp[9] = ix[1][1];
    jpp[10] = ix[2][1];
    /* words 11-15: ix[k][2..4] plus tjlsb and dtjmsb packed as bit fields
       that straddle word boundaries; the shift pairs (<<21 / >>11,
       <<31 / >>1, <<16 / >>16, <<29 / >>3) split one value over two words */
    jpp[11] = (ix[1][2]<<21) | ix[0][2];
    jpp[12] = (ix[0][3]<<31) | (ix[2][2]<<10) | (ix[1][2]>>11);
    jpp[13] = (ix[1][3]<<16) | (ix[0][3]>>1);
    jpp[14] = (ix[1][4]<<29) | (ix[0][4]<<18) | (ix[2][3]<<1) | (ix[1][3]>>16);
    jpp[15] = (tjlsb<<25) | (dtjmsb<<19) | (ix[2][4]<<8) | (ix[1][4]>>3);
    /* word 16: raw bit pattern of the float mass; word 17: particle index */
    jpp[16] = *((unsigned int *)(&fmass)) ;
    jpp[17] = *index;
    if (cp->jp_buffered_mode ){
	/* buffered: only advance the slot counter, nothing is sent here */
	cp->jp_buffer_current++;
    }else{
	/* unbuffered: jpp == jpdata here, so jpdata holds the packet;
	   write it to the IP board of each target cluster in turn */
	for(ic=0;ic< *nclusters;ic++){
	    boardid = clusters[*clusterid + ic]->ipboardid;
	    hib_sending_jp[boardid] = 1;
#ifdef INTERNAL_OUT    
	    fprintf(stderr,"set sending_jp = 1 for boardid = %d\n", boardid);
	    for(i=0;i<18;i++){
		fprintf(stderr,"%3d %8x ", i, jpdata[i]);
		if ((i%6)==5)fprintf(stderr,"\n");
	    }
#endif
	    linkwrite(boardid,hib_jplinkid[boardid],nword,jpdata);
	    PCIdummyaccess(boardid);
	    nwait(100);
	    hib_sending_jp[boardid] = 0;
	}
    }
#ifdef JPIPTEST
    for(i=0;i<18;i++){
	fprintf(stderr,"%3d %8x ", i, jpp[i]);
	if ((i%6)==5)fprintf(stderr,"\n");
    }
#endif    
#ifdef SIMULATOR
    /* mirror the same particle into the software simulator's j-memory */
    if (cp->simg6p->use_simulator){
	
	jp = cp->simg6p->jmem+ *address;
	jp->tjlsb = tjlsb;
	jp->dtjmsb = dtjmsb;
	jp->mass = convert_double_to_grape_float(*mass, INTERACTION_F_LEN_U);
	jp->index = *index;
	for(k=0;k<3;k++)for(i=0;i<5;i++)jp->ix[k][i] = ix[k][i];
    }
#endif

  return retcode;
}
#endif
/*
 * By-reference entry point that stores one j-particle for a single cluster.
 * It forwards every argument to g6_set_j_particle_multisend_() with a
 * cluster count of one and hands back that function's return value.
 */
int g6_set_j_particle_(int * clusterid,
                       int *address,
                       int *index,
                       double *tj,       /* particle time */
                       double *dtj,      /* particle time step */
                       double *mass,
                       double a2by18[3], /* a2dot divided by 18 */
                       double a1by6[3],  /* a1dot divided by 6 */
                       double aby2[3],   /* a divided by 2 */
                       double v[3],      /* velocity */
                       double x[3]       /* position */)
{
    int single_cluster = 1;
    int status;

    status = g6_set_j_particle_multisend_(clusterid, &single_cluster,
                                          address, index, tj, dtj, mass,
                                          a2by18, a1by6, aby2, v, x);
    return status;
}


/*
 * By-value variant of g6_set_j_particle_(): takes the scalar arguments by
 * value, passes their addresses on, and returns the callee's result.
 */
int g6_set_j_particle(int  clusterid,
                      int address,
                      int index,
                      double tj,        /* particle time */
                      double dtj,       /* particle time step */
                      double mass,
                      double a2by18[3], /* a2dot divided by 18 */
                      double a1by6[3],  /* a1dot divided by 6 */
                      double aby2[3],   /* a divided by 2 */
                      double v[3],      /* velocity */
                      double x[3]       /* position */)
{
    int status = g6_set_j_particle_(&clusterid, &address, &index,
                                    &tj, &dtj, &mass,
                                    a2by18, a1by6, aby2, v, x);

    return status;
}

/*
 * By-value variant of g6_set_j_particle_multisend_(): the scalars are taken
 * by value and forwarded by address; the callee's result is returned.
 */
int g6_set_j_particle_multisend(int  clusterid,
                                int  nclusters,
                                int address,
                                int index,
                                double tj,        /* particle time */
                                double dtj,       /* particle time step */
                                double mass,
                                double a2by18[3], /* a2dot divided by 18 */
                                double a1by6[3],  /* a1dot divided by 6 */
                                double aby2[3],   /* a divided by 2 */
                                double v[3],      /* velocity */
                                double x[3]       /* position */)
{
    int status = g6_set_j_particle_multisend_(&clusterid, &nclusters,
                                              &address, &index,
                                              &tj, &dtj, &mass,
                                              a2by18, a1by6, aby2, v, x);

    return status;
}


/*
 * Store a j-particle for which only mass and position are given (by-value
 * interface).  Velocity and all higher derivatives are passed as zero, the
 * particle time as 0.0 and the time step as 1.0.  Returns the result of
 * g6_set_j_particle_().
 */
int g6_set_j_particle_mxonly(int  clusterid,
                             int address,
                             int index,
                             double mass,
                             double x[3] /* position */)
{
    double particle_time = 0.0;
    double particle_step = 1.0;
    double zero_a2by18[3] = { 0.0, 0.0, 0.0 };
    double zero_a1by6[3]  = { 0.0, 0.0, 0.0 };
    double zero_aby2[3]   = { 0.0, 0.0, 0.0 };
    double zero_v[3]      = { 0.0, 0.0, 0.0 };

    return g6_set_j_particle_(&clusterid, &address, &index,
                              &particle_time, &particle_step, &mass,
                              zero_a2by18, zero_a1by6, zero_aby2, zero_v, x);
}
/*
 * By-reference counterpart of g6_set_j_particle_mxonly(): stores a
 * j-particle from mass and position only.  Velocity and higher derivatives
 * are zero, the particle time is 0.0 and the time step is 1.0.  Returns the
 * result of g6_set_j_particle_().
 */
int g6_set_j_particle_mxonly_(int* clusterid,
                              int* address,
                              int* index,
                              double* mass,
                              double x[3] /* position */)
{
    double particle_time = 0.0;
    double particle_step = 1.0;
    double zero_a2by18[3] = { 0.0, 0.0, 0.0 };
    double zero_a1by6[3]  = { 0.0, 0.0, 0.0 };
    double zero_aby2[3]   = { 0.0, 0.0, 0.0 };
    double zero_v[3]      = { 0.0, 0.0, 0.0 };

    return g6_set_j_particle_(clusterid, address, index,
                              &particle_time, &particle_step, mass,
                              zero_a2by18, zero_a1by6, zero_aby2, zero_v, x);
}


void g6_set_nip_(int * clusterid, int * nip)
{
  unsigned int ipdata[10];
  unsigned int ni;
#ifndef   DOUBLE_CHECK_RESULT
  ni = *nip;
#else  
  ni = (*nip)*2;
#endif
  ipdata[0] = NIP_ADDRESS;                 
  ipdata[1] = 0x1;                    
  ipdata[2] = ni;
#ifdef DMA
  wait_dma_to_finish(clusters[*clusterid]->ipboardid);
#endif  
  ipdatawrite(clusters[*clusterid]->ipboardid,ipdata);
  clusters[*clusterid]->ni = *nip; 
}

/*
 * Pad the j-particle memory so that the particle count becomes a multiple
 * of the number of chips per board.
 *
 * When nj is not a multiple of cp->nchipsperboard, the remaining addresses
 * nj .. (next multiple - 1) are filled with dummy particles of zero mass,
 * zero velocity/derivatives, time 0 and time step 0.125, all placed at the
 * same position xval on every axis.  xval is 300 scaled by 2^(50 - xunit).
 * In buffered mode the JP buffer is flushed afterwards.
 *
 * The scaling used to be done with integer shifts (1 << (50 - xunit)),
 * which is undefined once the shift count reaches the width of int.  It is
 * now done by repeated exact multiplication/division by two, which gives
 * the same value for every shift count the old code handled correctly.
 */
static void fill_zero_to_unused_memory_locations(int * clusterid, int nj)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    double tj = 0;
    double dtj = 0.125;
    double mass = 0;
    double xval = 300;
    double a2by18[3], a1by6[3], aby2[3], v[3], x[3];
    int address;
    int index;
    int ncount;
    int shift;
    int i;

    if ((nj % cp->nchipsperboard) == 0) return;
    ncount = cp->nchipsperboard - (nj % cp->nchipsperboard);

    /* xval *= 2^(50 - xunit); at most one of the two loops runs */
    for (shift = 50 - xunit; shift > 0; shift--) {
        xval *= 2.0;
    }
    for (shift = xunit - 50; shift > 0; shift--) {
        xval /= 2.0;
    }
    /*    fprintf(stderr,"xval, xunit = %e %d\n", xval, xunit);*/

    for (i = 0; i < 3; i++) {
        a2by18[i] = a1by6[i] = aby2[i] = v[i] = 0;
        x[i] = xval;
    }

    for (i = 0; i < ncount; i++) {
        address = nj + i;
        index   = nj + i + 1;
#ifdef INTERNAL_OUT0
        fprintf(stderr,"(fill_zero) addres = %d\n", address);
#endif
        g6_set_j_particle_(clusterid, &address, &index, &tj, &dtj, &mass,
                           a2by18, a1by6, aby2, v, x);
    }
    if (cp->jp_buffered_mode) {
        g6_flush_jp_buffer_and_multisend(*clusterid, 1);
    }
}

/*
 * Prepare the j-particle memory for *njp particles (by-reference entry).
 * When the count differs from the one stored in the cluster descriptor,
 * the unused tail locations are padded with dummy particles and the stored
 * count is updated; otherwise nothing is done.
 */
void g6_setup_njdata_(int * clusterid, int * njp)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];

    /*g6hib_foerror_clear(cp->foboardid) ;*/
    if (cp->nj == *njp) {
        return;
    }
    fill_zero_to_unused_memory_locations(clusterid, *njp);
    cp->nj = *njp;
}
/* By-value wrapper around g6_setup_njdata_(). */
void g6_setup_njdata(int clusterid, int njp)
{
    int *cluster_ref = &clusterid;
    int *count_ref = &njp;

    g6_setup_njdata_(cluster_ref, count_ref);
}

/*
 * Pad the j-particle memory of the given cluster after nj particles by
 * delegating to fill_zero_to_unused_memory_locations().  Unlike
 * g6_setup_njdata_(), the stored particle count is neither checked nor
 * updated here.
 */
void g6_fillzero_njdata(int clusterid, int nj)
{
    int *cluster_ref = &clusterid;

    fill_zero_to_unused_memory_locations(cluster_ref, nj);
}
/*
 * Write the j-particle count to the hardware.
 *
 * The FO board counter is cleared first, then the number of memory
 * locations per chip -- *njp divided by chips-per-board, rounded up -- is
 * written to address 0x1402 through the IP board.  With SIMULATOR defined
 * and the simulator enabled, the simulated cluster is run with the counts
 * stored in the cluster descriptor.
 */
void g6_set_njp_real_(int * clusterid, int * njp)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    unsigned int packet[10];
    unsigned int per_chip;

    counterclear(cp->foboardid);
    /*g6hib_foerror_clear(cp->foboardid) ;*/

    /* ceiling division */
    per_chip = (*njp +cp->nchipsperboard-1)/cp->nchipsperboard;
    if (jpbuf_dump_mode) {
        fprintf(stderr,"nj = %d\n",per_chip);
    }

    packet[0] = 0x1402;
    packet[1] = 0x1;
    packet[2] = per_chip;
    ipdatawrite(cp->ipboardid, packet);

#ifdef SIMULATOR
    if (cp->simg6p->use_simulator) {
        run_simulated_cluster(cp->simg6p, cp->nj, cp->ni, cp->nchips);
    }
#endif /*SIMULATOR*/
}

/*
 * Set the j-particle count: first pad the j-memory as needed
 * (g6_setup_njdata_), then write the count to the hardware
 * (g6_set_njp_real_).  The order of the two calls matters.
 */
void g6_set_njp_(int * clusterid, int * njp)
{
    /* step 1: padding and bookkeeping */
    g6_setup_njdata_(clusterid, njp);

    /* step 2: hardware register write */
    g6_set_njp_real_(clusterid, njp);
}


/* local prototype for fabs(), used by the scale routines below */
double fabs(double);
/*
 * Derive the force, jerk and potential scale exponents of i-particle
 * *address from estimated real values and store them in the cluster
 * descriptor (cp->fscale, cp->jscale, cp->phiscale).
 *
 *   acc, jerk  estimated acceleration and jerk vectors
 *   phi        estimated potential
 *   jfactor    weight applied to |jerk[k]| in the jerk magnitude
 *   ffactor    weight applied to |acc[k]| in the jerk magnitude
 *
 * The force scale uses the largest |acc[k]|; the jerk scale uses the
 * largest |jerk[k]|*jfactor + |acc[k]|*ffactor (replaced by 1 when zero).
 */
void g6_set_i_particle_scales_from_real_value_(int * clusterid,
					       int *address,
					       double acc[3],
					       double jerk[3],
					       double *phi,
					       double *jfactor,
					       double * ffactor)
{
#define SMALLNUM (1e-50)   
    int k,i,l;
    register int k1,k2,k3;
    int * fp;  
    int * jp; 
    int * pp; 
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    double amax = fabs(acc[0]);
    double jmax = fabs(jerk[0])*(*jfactor) + fabs(acc[0])*(*ffactor);
    i = *address;
    /* take the maximum over the remaining two components */
    for(k=1;k<3;k++){
	double jtmp = fabs(jerk[k])*(*jfactor)+ fabs(acc[k])*(*ffactor);
	if(fabs(acc[k])>amax)amax = fabs(acc[k]);
	if(jtmp>jmax)jmax = jtmp;
    }
    /* NOTE(review): only jmax is guarded against zero; amax and *phi are not */
    if (jmax == 0.0) jmax = 1;
#ifndef X86
    /* portable path: frexp() yields the binary exponent k of each value */
    frexp(amax, &k);
    cp->fscale[i] = 52 - k - 513 + 2*xunit;
    frexp(jmax, &k);
    cp->jscale[i] = 26 - k - 513 + 2*xunit + tunit;
    frexp(*phi, &k);
    cp->phiscale[i] = 52 - k - 513 + xunit;
#else
    /* At least with gcc2.91.66+x86 (P4) this is slightly faster.... */
    /* read the 11-bit exponent field straight out of the second int of
       each double (assumes that int holds the sign/exponent word, as on
       little-endian x86); the extra 1022 below compensates for using the
       biased exponent instead of the frexp() exponent */
    fp = ((int*)(&amax))+1;
    jp = ((int*)(&jmax))+1;
    pp = ((int*)(phi))+1;
    k1 = ((*fp)>>20) & 0x7ff;
    k2 = ((*jp)>>20) & 0x7ff;
    k3 = ((*pp)>>20) & 0x7ff;
    cp->fscale[i]  = 52 -  513 + 1022+ 2*xunit -k1;
    cp->jscale[i] = 26 -  513 + 1022+ 2*xunit + tunit -k2;
    cp->phiscale[i] = 52 -  513 +1022+ xunit-k3;

#endif    
    /* optional diagnostic dump of inputs and resulting scales */
    if (jpbuf_dump_mode){
	fprintf(stderr,"scaling xunit, tunit = %d %d\n", xunit, tunit);
	fprintf(stderr,"a %e %e %e %e\n", acc[0],acc[1],acc[2],amax);
	fprintf(stderr,"j %e %e %e %e\n", jerk[0],jerk[1],jerk[2],jmax);
	fprintf(stderr,"i %d fs %x %d ps %x %d js %x %d \n",
	       i,0x3ff&cp->fscale[i],cp->fscale[i],0x3ff&cp->phiscale[i],
	       cp->phiscale[i],0x3ff&cp->jscale[i],cp->jscale[i]);
    }
    
}
/*
 * Compute the force, jerk and potential scale exponents of i-particle
 * *address from estimated real values and store them in the caller's
 * arrays fscale[], jscale[] and phiscale[] at index *address.  The cluster
 * descriptor is not modified.
 *
 *   clusterid  not used by the computation; kept for interface symmetry
 *   acc, jerk  estimated acceleration and jerk vectors
 *   phi        estimated potential
 *   jfactor    weight applied to |jerk[k]| in the jerk magnitude
 *   ffactor    weight applied to |acc[k]| in the jerk magnitude
 *
 * The non-X86 branch used to store into the cluster descriptor
 * (cp->fscale etc.) instead of the output arrays, so on those platforms
 * the outputs were never written.  Both branches now fill the output
 * arrays, which is what the X86 branch always did.
 */
void g6_calc_i_particle_scales_from_real_value_(int * clusterid,
                                                int *address,
                                                double acc[3],
                                                double jerk[3],
                                                double *phi,
                                                double *jfactor,
                                                double * ffactor,
                                                int* fscale,
                                                int* jscale,
                                                int* phiscale)
{
#define SMALLNUM (1e-50)
    int k, i;
#ifdef X86
    int k1, k2, k3;
    int * fp;
    int * jp;
    int * pp;
#endif
    double amax = fabs(acc[0]);
    double jmax = fabs(jerk[0])*(*jfactor) + fabs(acc[0])*(*ffactor);

    (void)clusterid;
    i = *address;

    /* largest component magnitudes */
    for (k = 1; k < 3; k++) {
        double jtmp = fabs(jerk[k])*(*jfactor) + fabs(acc[k])*(*ffactor);
        if (fabs(acc[k]) > amax) amax = fabs(acc[k]);
        if (jtmp > jmax) jmax = jtmp;
    }
    if (jmax == 0.0) jmax = 1;

#ifndef X86
    /* portable path: frexp() yields the binary exponent k of each value */
    frexp(amax, &k);
    fscale[i] = 52 - k - 513 + 2*xunit;
    frexp(jmax, &k);
    jscale[i] = 26 - k - 513 + 2*xunit + tunit;
    frexp(*phi, &k);
    phiscale[i] = 52 - k - 513 + xunit;
#else
    /* At least with gcc2.91.66+x86 (P4) this is slightly faster.... */
    /* read the 11-bit exponent field from the second int of each double;
       the extra 1022 compensates for the exponent bias */
    fp = ((int*)(&amax))+1;
    jp = ((int*)(&jmax))+1;
    pp = ((int*)(phi))+1;
    k1 = ((*fp)>>20) & 0x7ff;
    k2 = ((*jp)>>20) & 0x7ff;
    k3 = ((*pp)>>20) & 0x7ff;
    fscale[i]  = 52 -  513 + 1022+ 2*xunit -k1;
    jscale[i] = 26 -  513 + 1022+ 2*xunit + tunit -k2;
    phiscale[i] = 52 -  513 +1022+ xunit-k3;
#endif

    if (jpbuf_dump_mode) {
        fprintf(stderr,"scaling xunit, tunit = %d %d\n", xunit, tunit);
        fprintf(stderr,"i %d fs %x %d ps %x %d js %x %d \n",
                i,0x3ff&fscale[i],fscale[i],0x3ff&phiscale[i],phiscale[i],0x3ff&jscale[i],jscale[i]);
    }
}

/*
 * Per-cluster staging buffers for i-particle scale exponents (force, jerk,
 * potential), 96 entries per cluster.  They are handed to
 * g6_calc_i_particle_scales_from_real_value_() by the g6_preset_* wrappers
 * below.
 */
static int fsbuf[NCLUSTERS][96];
static int jsbuf[NCLUSTERS][96];
static int psbuf[NCLUSTERS][96];
/* flag written by g6_set_sbufset(); its readers are not in this part of the file */
static int sbufset = 0;

/*
 * By-reference entry: compute the scale exponents for i-particle *address
 * and place them in this cluster's rows of the static staging buffers
 * (fsbuf, jsbuf, psbuf) rather than in the cluster descriptor.
 */
void g6_preset_i_particle_scales_from_real_value_(int * clusterid,
                                                  int *address,
                                                  double acc[3],
                                                  double jerk[3],
                                                  double *phi,
                                                  double *jfactor,
                                                  double * ffactor)
{
    int *force_row = fsbuf[*clusterid];
    int *jerk_row  = jsbuf[*clusterid];
    int *pot_row   = psbuf[*clusterid];

    g6_calc_i_particle_scales_from_real_value_(clusterid, address, acc, jerk,
                                               phi, jfactor, ffactor,
                                               force_row, jerk_row, pot_row);
}
/*
 * By-value variant of the preset routine.  The jerk weight is fixed at 6.0
 * and the acceleration weight at 0.0; results go into this cluster's rows
 * of the static staging buffers.  Note that phi is still a pointer.
 */
void g6_preset_i_particle_scales_from_real_value(int clusterid,
                                                 int address,
                                                 double acc[3],
                                                 double jerk[3],
                                                 double *phi)
{
    double jerk_weight = 6.0;
    double acc_weight = 0.0;
    int *force_row = fsbuf[clusterid];
    int *jerk_row  = jsbuf[clusterid];
    int *pot_row   = psbuf[clusterid];

    g6_calc_i_particle_scales_from_real_value_(&clusterid, &address, acc, jerk,
                                               phi, &jerk_weight, &acc_weight,
                                               force_row, jerk_row, pot_row);
}

/*
 * Copy the scale exponents of i-particle *address from the caller's arrays
 * into the cluster descriptor.  Source and destination use the same index.
 */
void g6_copy_i_particle_scales_(int * clusterid,
                                int *address,
                                int* fscale,
                                int* jscale,
                                int* phiscale)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    const int slot = *address;

    cp->fscale[slot]   = fscale[slot];
    cp->jscale[slot]   = jscale[slot];
    cp->phiscale[slot] = phiscale[slot];
#if 0
    printf("c %d fs %x %d ps %x %d js %x %d \n",
           slot,0x3ff&cp->fscale[slot],cp->fscale[slot],0x3ff&cp->phiscale[slot],
           cp->phiscale[slot],0x3ff&cp->jscale[slot],cp->jscale[slot]);
#endif
}

/* Store val in the file-scope sbufset flag. */
void g6_set_sbufset(int val)
{
    int new_value = val;

    sbufset = new_value;
}

/*
 * Short alias for g6_set_i_particle_scales_from_real_value_(); every
 * argument is forwarded unchanged.
 */
void g6_set_ip_scales_(int * clusterid,
                       int *address,
                       double acc[3],
                       double jerk[3],
                       double *phi,
                       double *jfactor,
                       double *ffactor)
{
    g6_set_i_particle_scales_from_real_value_(clusterid, address,
                                              acc, jerk, phi,
                                              jfactor, ffactor);
}
    
/*
 * Encode one i-particle into the 13-word packet ippacket[0..12].
 *
 *   clusterid  cluster whose stored scale exponents are used
 *   address    i-particle slot; selects the scale exponents
 *   index      particle index, stored verbatim in word 11
 *   x, v       position and velocity
 *   eps2, h2   stored as floats after scaling by xscale2
 *   ippacket   output buffer, at least 13 words
 *
 * Nothing is sent to the hardware here.  With SIMULATOR defined and the
 * simulator enabled, the same particle is also handed to the emulator.
 */
void create_one_i_particle(int clusterid,
			   int address,
			   int index,
			   double x[3], /* position */
			   double v[3], /* velocity */
			   double eps2,
			   double h2,
			   unsigned int ippacket[13] )
{
    ULONG ix[3];
    LONG  ixsigned;
    int k,ii;
    SHORTPACK pfv[3],pfeps2, pfh2;
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];
    ii = address;
    for(k=0;k<3;k++){
#ifndef X86
	ix[k] = CONVERT_DOUBLE_TO_GRAPE_INT_POS(x[k],xunit);
#else
	/* convert through a signed integer first so that negative
	   coordinates are reinterpreted rather than converted directly
	   from double to an unsigned type */
	ixsigned = x[k]*xscale;
	ix[k] = (ULONG) ixsigned;
#endif
	pfv[k].f = (float)(v[k]*vscale);
    } 
    /* NOTE(review): eps2/h2 are narrowed to float before the multiply,
       unlike the velocity above where the product is narrowed -- confirm
       this is intended */
    pfeps2.f = (float)(eps2)*xscale2;
    pfh2.f = (float)(h2)*xscale2;
    
    /* words 0-5: positions, high 32 bits first, then low 32 bits */
    ippacket[0] = ix[0]>>32;
    ippacket[1] = ix[0];
    ippacket[2] = ix[1]>>32;
    ippacket[3] = ix[1];
    ippacket[4] = ix[2]>>32;
    ippacket[5] = ix[2];
    /* words 6-10: raw float bit patterns of velocity, eps2 and h2 */
    ippacket[6] = pfv[0].i32;
    ippacket[7] = pfv[1].i32;
    ippacket[8] = pfv[2].i32;
    ippacket[9] = pfeps2.i32;
    ippacket[10] = pfh2.i32;
    ippacket[11] = index;
    /* word 12: three 10-bit scale fields -- phiscale (bits 20-29),
       fscale (bits 10-19), jscale (bits 0-9) */
    ippacket[12] = ((0x3ff&(cp->phiscale[ii]))<<20) | ((0x3ff&(cp->fscale[ii]))<<10) | (0x3ff&(cp->jscale[ii]));
#ifdef SIMULATOR
    if (cp->simg6p->use_simulator){
	 set_i_particle_data_on_emulator(cp->simg6p,address,x,v,eps2,h2,
					 global_rscale, index,
					 cp->fscale[ii],cp->jscale[ii],cp->phiscale[ii]);
    }	 
#endif	
#ifdef INTERNAL_OUT
    fprintf(stderr,"create_one_i_particle, %d %d %d %g %g\n", clusterid, address, index,
	    eps2, h2);
    fprintf(stderr,"x %g %g %g %8x%8x %8x%8x %8x%8x\n",x[0],x[1],x[2],
 	    ippacket[0],ippacket[1],ippacket[2],ippacket[3],ippacket[4],ippacket[5]);
    fprintf(stderr,"v %g %g %g %x %x %x\n",v[0],v[1],v[2], ippacket[6], ippacket[7], ippacket[8]);
    fprintf(stderr,"eps, h2, scales %x %x %x %x %x\n",ippacket[9], ippacket[10], cp->phiscale[ii], cp->fscale[ii], cp->jscale[ii]);
#endif    
}

/*
 * Encode a single i-particle and write it to the cluster's IP board.
 *
 * The packet is a two-word header (slot address shifted left by 4, payload
 * length) followed by the 13 words produced by create_one_i_particle().
 * With DMAxxx defined the write goes through ipdatawrite_dma() and the
 * length word is 0x10 for slots below 47, 0xd otherwise; without it the
 * length word is always 0xd and ipdatawrite() is used.  When
 * jpbuf_dump_mode is set the packet is dumped to stderr.
 */
void g6_set_i_particle_(int * clusterid,
                        int *address,
                        int *index,
                        double x[3], /* position */
                        double v[3], /* velocity */
                        double * eps2,
                        double * h2)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    unsigned int ipdata[20];
    int slot;

    /* payload starts right after the two header words */
    create_one_i_particle(*clusterid, *address, *index, x, v, *eps2, *h2,
                          &ipdata[2]);
    slot = *address;
    ipdata[0] = slot<<4;

#ifdef DMAxxx
    ipdata[1] = 0x10;
    if (slot >= 47) {
        ipdata[1] = 0xd;
    }
    ipdatawrite_dma(*clusterid, ipdata);
#else
    ipdata[1] = 0xd;
    ipdatawrite(clusters[*clusterid]->ipboardid, ipdata);
#endif

    if (!jpbuf_dump_mode) {
        return;
    }
    fprintf(stderr,"g6_set_i_particle, %d %d %d %g %g\n", *clusterid, *address, *index,
            *eps2, *h2);
    fprintf(stderr,"x %g %g %g %8x%8x %8x%8x %8x%8x\n",x[0],x[1],x[2],
            ipdata[2],ipdata[3],ipdata[4],ipdata[5],ipdata[6],ipdata[7]);
    fprintf(stderr,"v %g %g %g %x %x %x\n",v[0],v[1],v[2], ipdata[8], ipdata[9], ipdata[10]);
    fprintf(stderr,"eps, h2, scales %x %x %x %x %x\n",ipdata[11], ipdata[12], cp->phiscale[slot], cp->fscale[slot], cp->jscale[slot]);
}

/*
 * Encode *count consecutive i-particles, starting at slot *start_address,
 * into one packet and write it to the cluster's IP board.
 *
 * The packet is a two-word header (start address shifted left by 4, number
 * of payload words = 13 * count) followed by 13 words per particle from
 * create_one_i_particle().  The write uses ipdatawrite_dma() when
 * cp->ijpdma is set and ipdatawrite() otherwise.  When jpbuf_dump_mode is
 * set the packet is dumped to stderr.
 *
 * The packet lives in a static 1024-word buffer, so at most
 * (1024 - 2) / 13 = 78 particles fit.  A count outside 0..78 used to
 * overrun the buffer; it is now rejected with a message on stderr and
 * nothing is sent.
 */
void g6_set_i_particle_vector_(int * clusterid,
                               int *start_address,
                               int *count,
                               int index[],
                               double x[][3], /* position */
                               double v[][3], /* velocity */
                               double eps2[],
                               double h2[])
{
    enum {
        IPBUF_WORDS = 1024,
        IP_HEADER_WORDS = 2,
        IP_PARTICLE_WORDS = 13,
        IP_MAX_PARTICLES = (IPBUF_WORDS - IP_HEADER_WORDS) / IP_PARTICLE_WORDS
    };
    static unsigned int ipdata[IPBUF_WORDS];
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    int i, ii;

    /*    fprintf(stderr,"vector called with %d\n", *count);*/
    if (*count < 0 || *count > IP_MAX_PARTICLES) {
        fprintf(stderr,
                "(g6_set_i_particle_vector_) count %d out of range 0..%d -- nothing sent\n",
                *count, (int)IP_MAX_PARTICLES);
        return;
    }

    ipdata[0] = (*start_address)<<4;
    ipdata[1] = IP_PARTICLE_WORDS*(*count);
    for (i = 0; i < *count; i++) {
        ii = *start_address + i;
        create_one_i_particle(*clusterid, ii, index[i], x[i], v[i],
                              eps2[i], h2[i],
                              ipdata + IP_HEADER_WORDS + i*IP_PARTICLE_WORDS);
    }

    if (cp->ijpdma) {
        ipdatawrite_dma(*clusterid, ipdata);
    } else {
        ipdatawrite(clusters[*clusterid]->ipboardid, ipdata);
    }

    if (jpbuf_dump_mode) {
        fprintf(stderr,"ipdata 0, 1 = %x %d\n", ipdata[0],ipdata[1]);
        for (i = 0; i < IP_PARTICLE_WORDS*(*count); i++) {
            fprintf(stderr," %8x",ipdata[i+2]);
            if ((i%IP_PARTICLE_WORDS) == IP_PARTICLE_WORDS-1) fprintf(stderr,"\n");
        }
    }
}


/* per-cluster word buffer, 1024 words each; printed by dump_fodata() */
static unsigned int fodata[NCLUSTERS][1024];

/*
 * Print the first nwords words of fodata[clusterid] to stderr in hex,
 * seven words per line, each line prefixed with the index of its first
 * word.
 */
void dump_fodata(int clusterid, int nwords)
{
    int w;

    fprintf(stderr,"Dumping FODATA\n");
    for (w = 0; w < nwords; w++) {
        const int column = w % 7;

        if (column == 0) {
            fprintf(stderr,"%4d: ", w);
        }
        fprintf(stderr," %8x ", fodata[clusterid][w]);
        if (column == 6) {
            fprintf(stderr,"\n");
        }
    }
    /* finish a partially filled last line */
    if (nwords % 7) {
        fprintf(stderr,"\n");
    }
}

/*
 * Print the first nwords words of dma_work_area[clusterid] to stderr in
 * hex, seven words per line, each line prefixed with the index of its
 * first word.
 */
void dump_dmawdata(int clusterid, int nwords)
{
    int w;

    fprintf(stderr,"Dumping DMADATA\n");
    for (w = 0; w < nwords; w++) {
        const int column = w % 7;

        if (column == 0) {
            fprintf(stderr,"%4d: ", w);
        }
        fprintf(stderr," %8x ", dma_work_area[clusterid][w]);
        if (column == 6) {
            fprintf(stderr,"\n");
        }
    }
    /* finish a partially filled last line */
    if (nwords % 7) {
        fprintf(stderr,"\n");
    }
}
/*
 * Compare the two copies of the result data in dma_work_area[clusterid]:
 * words 0 .. ni*14-1 against words ni*14 .. 2*ni*14-1.
 *
 * Returns 0 when both halves are identical.  On the first mismatch the two
 * word indices and both values are reported on stderr, the whole area is
 * dumped, and 1 is returned.
 *
 * The message used to print the second index and the first value with no
 * separator ("%d%x"), which made the report unreadable; a space is now
 * inserted between them.
 */
int compare_double_results(int clusterid, int ni)
{
    int i, nwords;

    nwords = ni*14;
    for (i = 0; i < nwords; i++) {
        if (dma_work_area[clusterid][i] != dma_work_area[clusterid][i+nwords]) {
            fprintf(stderr,"double check error at %d  %d %x != %x\n",
                    i, i+nwords, dma_work_area[clusterid][i],
                    dma_work_area[clusterid][i+nwords]);
            dump_dmawdata(clusterid, nwords*2);
            return 1;
        }
    }
    return 0;
}

/*
 * Set the cluster's fofferror_check_mode; look_for_ff_results() skips its
 * scan when this is 0.
 */
void g6_setfoerrchkmode_(int * clusterid, int * mode)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];

    cp->fofferror_check_mode = *mode;
}

	
/*
 * Scan the first ni*14 words of dma_work_area[clusterid] for the pattern
 * 0xffffffff.
 *
 * Returns 0 immediately when checking is disabled
 * (cp->fofferror_check_mode == 0).  Otherwise only the first occurrence is
 * examined: when it sits at the position remembered from the previous call
 * it is tolerated (return 0); when it sits elsewhere it is reported, the
 * area is dumped, the position is remembered and 1 is returned.  When no
 * such word exists the remembered position is reset to -1 and 0 is
 * returned.
 */
int look_for_ff_results(int clusterid, int ni)
{
    GRAPE6_CLUSTER_PTR cp = clusters[clusterid];
    int nwords;
    int pos;

    if (cp->fofferror_check_mode == 0) {
        return 0;
    }

    nwords = ni*14;
    for (pos = 0; pos < nwords; pos++) {
        if (dma_work_area[clusterid][pos] != 0xffffffff) {
            continue;
        }
        if (pos == cp->fofferror_location) {
            /* same place as last time */
            return 0;
        }
        fprintf(stderr,"look_for_ff error at %d  %x\n",
                pos, dma_work_area[clusterid][pos]);
        dump_dmawdata(clusterid, nwords);
        cp->fofferror_location = pos;
        return 1;
    }

    cp->fofferror_location = -1;
    return 0;
}

/*
 * Print the first nwords words of fo_work_area[clusterid] to stderr in
 * hex, seven words per line, each line prefixed with the index of its
 * first word.
 *
 * The banner used to read "Dumping DMADATA", the same as dump_dmawdata(),
 * although a different buffer is printed; it now names the buffer that is
 * actually dumped.
 */
void dump_foworkdata(int clusterid, int nwords)
{
    int i;

    fprintf(stderr,"Dumping FOWORKDATA\n");
    for (i = 0; i < nwords; i++) {
        if ((i%7) == 0) fprintf(stderr,"%4d: ", i);
        fprintf(stderr," %8x ", fo_work_area[clusterid][i]);
        if ((i%7) == 6) fprintf(stderr,"\n");
    }
    if ((nwords%7) != 0) fprintf(stderr,"\n");
}

/*
 * Move the contents of dma_work_area[clusterid] down by ishift words:
 * word i receives the old word i + ishift, for i = 0 .. nwords-1.  The
 * copy runs in ascending order, which is the correct direction for a
 * non-negative ishift.
 *
 * The progress message used to say "Dumping DMADATA" although nothing is
 * dumped here; it now describes the shift.
 */
void shift_dmawdata(int clusterid, int nwords, int ishift)
{
    int i;

    fprintf(stderr,"Shifting DMADATA by %d words\n", ishift);
    for (i = 0; i < nwords; i++) {
        dma_work_area[clusterid][i] = dma_work_area[clusterid][i+ishift];
    }
}

    
	


/*
 * Fetch the results for all cp->ni i-particles of a cluster and convert
 * them to double precision.
 *
 *   acc[i][0..2]   accelerations
 *   jerk[i][0..2]  jerks
 *   phi[i]         potentials (the raw value is negated)
 *   nnbindex[i]    word 12 of each result record
 *   flag[i]        word 13 of each result record
 *
 * Each particle occupies 14 words in dma_work_area[*clusterid]: words 0-7
 * are read as four 64-bit integers (acc x/y/z, potential), words 8-10 as
 * 32-bit integers (jerk), then words 12 and 13 as above.  The raw integers
 * are multiplied by a power of two built from the particle's stored scale
 * exponent and xunit/tunit.
 *
 * Returns 0 on success.  A non-zero value means the wait, the word
 * counter, the optional consistency checks or g6hib_foerror() failed (the
 * output arrays are then not filled), or that a record had an error flag
 * set or disagreed with the simulator (the arrays are filled anyway).
 *
 * Changes from the previous version: the INTERNAL_OUT diagnostics printed
 * the never-assigned variables ascale/pscale/jscale and now print the scale
 * factor actually used; the assignment inside the g6hib_foerror() test is
 * written out; unused locals are removed.
 */
int g6_get_force_etc_(int * clusterid,
                      double acc[][3],
                      double jerk[][3],
                      double phi[],
                      int nnbindex[],
                      int flag[])
{
    DATAPACK scalep;
    LONG * lp;
    int i, adr, k;
    int * ip;
    int nihard, nisoft;
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    int iret = 0;
    int iexp;
    unsigned int * foptr;

    nisoft = cp->ni;
    nihard = nisoft;
#ifdef DOUBLE_CHECK_RESULT
    /* the hardware delivers every result twice in this mode */
    nihard = nisoft * 2;
#endif
    foptr = dma_work_area[*clusterid];
#if REALHARD
    if (fodatawait(cp->foboardid, nihard*5)){
        /* Note that the second argument is nihard*5 while a full result is
           14 words per particle, so this presumably returns once part of
           the data has arrived; the rest is expected to come along while
           the read below is ongoing.  (The original note spoke of ni*9,
           i.e. 9/14 of the data.)

           THIS IS HIGHLY UNSAFE ROUTINE

           J. Makino 2000/4/10

           */
        return 1;
    }

#ifdef DMA
    {
        int ipio = 1;
        if (ipio > nihard) ipio = nihard;
        MB;
        if (ipio < nihard){
            g6hib_dmaw(clusters[*clusterid]->foboardid,clusters[*clusterid]->dma_offset,
                       nihard*14);
            uwait((nihard*14)/33);
            while(g6hib_dma_status(clusters[*clusterid]->foboardid)){
                g6_dummywait();
                uwait(10);
            }
#ifdef DMA_BROKEN_FIRSTWORD
            dma_work_area[*clusterid][0]= fo_work_area[*clusterid][0];
#endif
        }else{
            fodataread(clusters[*clusterid]->foboardid, nihard*14,
                       fo_work_area[*clusterid]);
            for(i=0; i<nihard*14; i++)
                dma_work_area[*clusterid][i] = fo_work_area[*clusterid][i];
        }
    }

#else
    fodataread(cp->foboardid, nihard*14, dma_work_area[*clusterid]);
#endif
    /* the board's word counter must match the expected record size */
    if (g6hib_getcounter(clusters[*clusterid]->foboardid) != nihard*14){
        fprintf(stderr,"get_force: data count failure %d %d\n",
                g6hib_getcounter(clusters[*clusterid]->foboardid), nihard*14);
        iret= 1;
    }

#ifdef DMA
    if (look_for_ff_results(*clusterid, nisoft)) iret= 1;
#endif
#ifdef DOUBLE_CHECK_RESULT
    if (compare_double_results(*clusterid, nisoft)) iret= 1;
#endif
    if (iret) {
        fprintf(stderr, "get_force: error return\n");
        return iret;
    }
    iret = g6hib_foerror(cp->foboardid, nihard);
    if (iret){
        fprintf(stderr, "g6hib_foerror: error return %d\n",iret);
        g6hib_foerror_clear(cp->foboardid);
        return iret;
    }
#else
    /* no real hardware: take the records from the simulator */
    for(i=0;i<nisoft;i++){
        adr = i*14;
        if (cp->simg6p->use_simulator){
            unsigned int sim_fodata[14];
            construct_sim_fodata(sim_fodata, &(cp->simg6p->reduced_result[i]));
            for(k = 0; k<14;k++){
                foptr[adr+k]  = sim_fodata[k];
            }
        }
    }
#endif /*REALHARD*/

    for(i=0;i<nisoft;i++){

        adr = i*14;
        lp = (LONG*) (foptr+adr);

        /* note that word order is assumed to be CORRECT here ... */

        /* Each scale factor is a power of two built by writing the biased
           exponent straight into the exponent field of a double, through
           the DATAPACK union.  (Going through a pointer cast instead did
           not work with gcc 2.96 and 3.x at -O2 or higher.) */
        iexp = 0x3ff-(cp->fscale[i]-xunit*2+512);
        scalep.i64 = (((ULONG) iexp) & 0x7ff) <<52;
#ifdef INTERNAL_OUT
        fprintf(stderr,"acc scale =  %e\n",  scalep.d);
#endif
        for(k=0;k<3;k++){
            acc[i][k]= (*(lp+k))*scalep.d;
        }

        iexp = 0x3ff-(cp->phiscale[i]-xunit+512);
        scalep.i64 = (((ULONG) iexp) & 0x7ff) <<52;
        phi[i] = -(*(lp+3))*scalep.d;
#ifdef INTERNAL_OUT
        fprintf(stderr,"pot scales =  %e\n",  scalep.d);
#endif

        /* jerk components are 32-bit signed integers in words 8-10 */
        ip = (int *) (foptr+adr+8);
        iexp = 0x3ff-(cp->jscale[i]-xunit*2-tunit+512);
        scalep.i64 = (((ULONG) iexp) & 0x7ff) <<52;
#ifdef INTERNAL_OUT
        fprintf(stderr,"jerk scales =  %e\n",  scalep.d);
#endif
        for(k=0;k<3;k++){
            jerk[i][k] =  ((double)(*(ip+k)))*scalep.d;
#ifdef INTERNAL_OUT
            fprintf(stderr,"jerk ik = %d %d %x %e\n", i,k, *(ip+k),jerk[i][k]);
#endif
        }
        nnbindex[i] = foptr[adr+12];
        flag[i] = foptr[adr+13];
        /* count error flags; any of them makes the call fail */
        if(flag[i] & (CERRFLAGMASK)){
            struct grape6_errorcounters * ecp = &(cp->errorcounters);
            if(flag[i] & (JPERRFLAGMASK)) ecp->jpperr_count++;
            if(flag[i] & (IPERRFLAGMASK)) ecp->ipperr_count++;
            if(flag[i] & (ECCCORRFLAGMASK)) ecp->memecc_count++;
            if(flag[i] & (ECCERRFLAGMASK)) ecp->memuncorrect_count++;
            iret = 1;
        }

#ifdef SIMULATOR
        /* cross-check every word of the record against the simulator */
        if (cp->simg6p->use_simulator){
            unsigned int sim_fodata[14];
            int w, error;
            construct_sim_fodata(sim_fodata, &(cp->simg6p->reduced_result[i]));
            error = 0;
            for(w = 0; w<14;w++){
                if (foptr[adr+w]  != sim_fodata[w]){
                    error ++;
                    fprintf(stderr,"(g6 get force) error at %d %d\n",i, w);
                    iret = 1;
                }
            }
            /* a positive verbose level forces the dump even without errors */
            error = error + cp->simg6p->simulator_verbose_level;
            if (error > 0){
                fprintf(stderr,"(g6 get force) i = %d\n", i);
                for(w = 0; w<14;w++){
                    fprintf(stderr,"%4d G:%8x H:%8x %8x\n", w, foptr[adr+w],sim_fodata[w],
                            foptr[adr+w] ^sim_fodata[w]);
                }
            }
        }
#endif /*SIMULATOR*/
    }
    return iret;
}



/*
 * Same as g6_get_force_etc_() for callers that do not want the nnbindex
 * output: that output is directed into a static per-cluster scratch row of
 * MAXPIPELINESPERCHIP+12 entries.  Returns the callee's result.
 */
int g6_get_force_(int * clusterid,
                  double acc[][3],
                  double jerk[][3],
                  double phi[],
                  int flag[])
{
    static int nnbindex[NCLUSTERS][MAXPIPELINESPERCHIP+12];
    int *scratch_row = nnbindex[*clusterid];

    return g6_get_force_etc_(clusterid, acc, jerk, phi, scratch_row, flag);
}


/*
 * Read the words currently available on the board (at most MAXBURST) and
 * print them next to the expected words in fodatabuf, together with the
 * XOR of each pair, on stderr.
 *
 *   board        board to read from
 *   fodatabuf    expected words
 *   fodatacount  number of expected words
 *
 * The comparison loop used to index the received buffer up to fodatacount
 * regardless of how many words had been read, reading uninitialised data
 * and, for counts above MAXBURST, memory past the end of the buffer.
 * Expected words with no received counterpart are now printed with a
 * "(no data)" marker instead.
 */
void read_and_compare_result(int board,
                             unsigned int  fodatabuf[],
                             unsigned int fodatacount)
{
    unsigned int g6data[MAXBURST];
    unsigned int ng6data;
    unsigned int i;

    ng6data = g6hib_getcounter(board);
    fprintf(stderr,"(read and compare results, ndata = host:%d g6:%d\n", fodatacount, ng6data);
    if (ng6data > MAXBURST) ng6data = MAXBURST;
    linkread(board,ng6data,g6data);

    for (i = 0; i < fodatacount; i++) {
        if (i < ng6data) {
            fprintf(stderr,"%4d H:%8x G:%8x %8x\n", (int)i, fodatabuf[i],g6data[i],
                    fodatabuf[i] ^g6data[i]);
        } else {
            /* nothing was received for this expected word */
            fprintf(stderr,"%4d H:%8x G:(no data)\n", (int)i, fodatabuf[i]);
        }
    }
}

/*
  g6_send_testpattern
  file format: chiptest pattern file
 */
/*
 * Replay a chip-test pattern file on a board.
 *
 * Every line starts with a header token followed by a hexadecimal word:
 *   IPHST  word to send on the IP link
 *   JPHST  word to send on the JP link (a word equal to 0xffc00 is sent
 *          as 0)
 *   FOHST  expected output word; it is recorded only when columns 16 and
 *          17 of the line are both '0'
 * Any other line flushes the words collected so far.  Consecutive words
 * for the same link are sent with one linkwrite().  When a run of FOHST
 * lines ends, the board output is read and printed next to the expected
 * words by read_and_compare_result(), and the board counter is cleared.
 *
 * Returns 0 on success, -1 when file_name is NULL or cannot be opened.
 *
 * Robustness fixes over the previous version:
 *  - the header token is read with a width limit (it could overflow
 *    header[] before);
 *  - lines on which sscanf() finds no token or no data word no longer
 *    leave header/data uninitialised;
 *  - the send buffer is flushed when full, and expected words beyond the
 *    capacity of fodatabuf are dropped with a warning, instead of writing
 *    past the arrays;
 *  - columns 16/17 are examined only when the line is long enough.
 */
int g6_send_testpattern(int board,
                        char * file_name)
{
    enum { PATTERN_BUF_WORDS = 8192 };
    FILE * fin;
    char linebuf[1024];
    unsigned int databuf[PATTERN_BUF_WORDS];
    unsigned int datacount;
    unsigned int linkprev;
    unsigned int fodatabuf[PATTERN_BUF_WORDS];
    unsigned int fodatacount = 0;
    unsigned int fostate = 0;
    int fo_overflow_reported = 0;

    /*see if the data file name is set */
    if(file_name == NULL) return -1;
#if INTERNAL_OUT0
    printf("(g6_send_testpattern) file to open: %s\n", file_name);
#endif
    /* see if data file exists */
    fin = fopen(file_name,"r");
    if(fin == NULL){
        fprintf(stderr,"(g6_send_testpattern) failed to open defect file %s\n",
                file_name);
        return -1;
    }
    /* data file exists*/

    datacount = 0;
    linkprev = -1;          /* (unsigned)-1 means "no link selected yet" */
    counterclear(board);
    g6hib_printcounter(board);
    while(fgets(linebuf, 1023,fin)!=NULL){
        unsigned long ldata = 0;
        unsigned int  data;
        int link;
        int nfields;
        char header[128];

        header[0] = '\0';
        nfields = sscanf(linebuf, "%127s %lx", header, &ldata);
        if (nfields < 1) header[0] = '\0';   /* blank line: no token */
        if (nfields < 2) ldata = 0;          /* no data word on this line */
        data = (unsigned int) (ldata & 0xffffffff);

        link = -1;
        if(strcmp("IPHST", header)==0){
            link = IPLINK;
        }else if(strcmp("JPHST", header)==0){
            link = JPLINK;
        }

        /* a line that is not link data ends the current run */
        if ((link == -1) && (datacount > 0)){
            linkwrite(board,linkprev, datacount,databuf);
            datacount = 0;
        }
        if ((link == IPLINK) ||(link == JPLINK)){
            if (link != linkprev){
                /* switching links: flush what was collected for the old one */
                if ((linkprev != -1) && (datacount > 0)){
                    linkwrite(board,linkprev, datacount,databuf);
                }
                linkprev = link;
                datacount = 0;
            }
            if ((link == JPLINK) && (data == 0xffc00)) data = 0;
#if INTERNAL_OUT0
            fprintf(stderr,"Sending data %x to Link %x --- %s", data, link, linebuf);
#endif
            /* never write past the send buffer: flush it when full */
            if (datacount >= PATTERN_BUF_WORDS){
                linkwrite(board,linkprev, datacount,databuf);
                datacount = 0;
            }
            databuf[datacount] = data;
            datacount ++;
            /*	    linkwrite_oneword(board, link, data);*/
        }

        if(strcmp("FOHST", header)==0){
#if INTERNAL_OUT0
            fprintf(stderr,"Expected  data: %s", linebuf);
#endif
            fostate = 1;
            if((strlen(linebuf) > 17) &&
               (linebuf[16] == '0')&&(linebuf[17] == '0')){
                if (fodatacount < PATTERN_BUF_WORDS){
                    fodatabuf[fodatacount] = data;
                    fodatacount ++;
                }else if (!fo_overflow_reported){
                    fprintf(stderr,"(g6_send_testpattern) more than %d expected words in one run; extra words ignored\n",
                            (int)PATTERN_BUF_WORDS);
                    fo_overflow_reported = 1;
                }
#if INTERNAL_OUT0
                fprintf(stderr,"Valid expected  data: %s", linebuf);
#endif
            }else{
#if INTERNAL_OUT0
                fprintf(stderr,"Invalid expected  data: %s", linebuf);
#endif
            }
        }else{
            /* the run of expected words has ended: compare and reset */
            if(fostate == 1){
                read_and_compare_result(board, fodatabuf, fodatacount);
                counterclear(board);
                fodatacount = 0;
                fostate = 0;
                fo_overflow_reported = 0;
            }
        }
    }
    /* end of file: send and compare whatever is still pending */
    if ((linkprev != -1) && (datacount > 0)){
        linkwrite(board,linkprev, datacount,databuf);
    }
    if(fostate == 1){
        read_and_compare_result(board, fodatabuf, fodatacount);
        counterclear(board);
        fodatacount = 0;
    }
    fclose(fin);
    return 0;
}

#define NDIM 3
void g6_guestimate_acc_etc( int n,
			 double eps2,
			 double m[],
			 double a[][NDIM],
			 double j[][NDIM],
			 double p[])
     /* this routine is for GRAPE-6. Gives some order-estimated values
	force etc */
{
    int i,k;
    double fscale = m[0]/eps2*sqrt(n+0.0);
    double pscale = m[0]/sqrt(eps2)*sqrt(n+0.0);
    if (fscale < 10.0) fscale = 10.0;
    if (pscale < 10.0) pscale = 10.0;
    for(i=0;i<n;i++){
	p[i] = pscale;
	for(k=0;k<NDIM;k++){
	    a[i][k] = fscale;
	    j[i][k] = fscale;
	}
    }
}


/*
 * Update *flag from a tri-state mode value:
 *   mode > 0   set *flag to 1
 *   mode == 0  set *flag to 0
 *   mode < 0   leave *flag unchanged
 *
 * The function was declared without a return type (implicit int, not valid
 * since C99) and returned no value; it is now declared void.
 */
static void set_test_mode(int mode, int * flag)
{
    if (mode > 0) {
        *flag = 1;
    } else if (mode == 0) {
        *flag = 0;
    }
}


/*
 * Switches that decide whether g6_test_flag_() reports an overflow in the
 * force, jerk or potential as a scaling error (1 = report, 0 = ignore).
 * All three are on by default.  They were declared without a type
 * (implicit int, not valid since C99); the type is now spelled out.
 */
static int test_force_overflow_flag = 1;
static int test_jerk_overflow_flag = 1;
static int test_pot_overflow_flag = 1;

/*
 * Enable or disable the overflow tests for force, jerk and potential.
 * Each argument is applied through set_test_mode(): a positive value
 * enables the test, zero disables it, a negative value leaves it alone.
 */
void g6_set_overflow_flag_test_mode(int force_test_mode,
                                    int jerk_test_mode,
                                    int pot_test_mode)
{
    int *force_switch = &test_force_overflow_flag;
    int *jerk_switch  = &test_jerk_overflow_flag;
    int *pot_switch   = &test_pot_overflow_flag;

    set_test_mode(force_test_mode, force_switch);
    set_test_mode(jerk_test_mode,  jerk_switch);
    set_test_mode(pot_test_mode,   pot_switch);
}

/* By-reference wrapper around g6_set_overflow_flag_test_mode(). */
void g6_set_overflow_flag_test_mode_(int *force_test_mode,
                                     int *jerk_test_mode,
                                     int *pot_test_mode)
{
    const int force_mode = *force_test_mode;
    const int jerk_mode = *jerk_test_mode;
    const int pot_mode = *pot_test_mode;

    g6_set_overflow_flag_test_mode(force_mode, jerk_mode, pot_mode);
}

/*
 * Report the current state of the three overflow-test switches (force,
 * jerk, potential) through the output pointers.
 */
void g6_get_overflow_flag_test_mode_(int *force_test_mode,
                                     int *jerk_test_mode,
                                     int *pot_test_mode)
{
    force_test_mode[0] = test_force_overflow_flag;
    jerk_test_mode[0]  = test_jerk_overflow_flag;
    pot_test_mode[0]   = test_pot_overflow_flag;
}

/*
 * Name without trailing underscore for g6_get_overflow_flag_test_mode_();
 * the three output pointers are forwarded unchanged.
 */
void g6_get_overflow_flag_test_mode(int *force_test_mode,
                                    int *jerk_test_mode,
                                    int *pot_test_mode)
{
    g6_get_overflow_flag_test_mode_(force_test_mode, jerk_test_mode, pot_test_mode);
}
	

    

/*
 * g6_test_flag_ : classify one result flag word.
 *
 * Returns COMMERROR if any CERRFLAGMASK bit is set.  Otherwise, if any
 * bit outside TESTFLAGMASK is set, a message is printed and the first
 * matching enabled category is returned (FORCESCALINGERROR, then
 * JERKSCALINGERROR, then POTSCALINGERROR).  Returns 0 in every other
 * case.  clusterid is not used by the active code.
 */
int g6_test_flag_(int * clusterid, int * flagp)
{
    const int status = *flagp;

    if ((status & (CERRFLAGMASK)) != 0){
        fprintf(stderr,"(g6_test_flag) communication error  %x -- abort\n",
                status);
        return COMMERROR;
    }

    /* nothing outside the test mask: no scaling problem to report */
    if ((status & (~ TESTFLAGMASK)) == 0){
        return 0;
    }

#if 1
    fprintf(stderr,"(g6_test_flag) scaling error %x\n",status);
    /*	dump_dmawdata(*clusterid, 48*14);*/
#endif
    if (test_force_overflow_flag && (status & FORCEFLAGMASK)){
        return FORCESCALINGERROR;
    }
    if (test_jerk_overflow_flag && (status & JERKFLAGMASK)){
        return JERKSCALINGERROR;
    }
    if (test_pot_overflow_flag && (status & POTFLAGMASK)){
        return POTSCALINGERROR;
    }
    return 0;
}

void g6_adjust_ip_scales_(int * clusterid, int *address, int * flagp)
{
    int i, flag;
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    flag = *flagp;
    i = *address;
    if(flag & FORCEFLAGMASK){
	cp->fscale[i] -= 5;
	fprintf(stderr, "(adjust_ip_scales) decrimenting scale for force %d\n",
		cp->fscale[i]);
    }
    if(flag & JERKFLAGMASK) {
	cp->jscale[i] -= 5;
	fprintf(stderr, "(adjust_ip_scales) decrimenting scale for jerk%d\n",
		cp->jscale[i]);
    }
    if(flag & POTFLAGMASK) {
	cp->phiscale[i] -=5;
	fprintf(stderr, "(adjust_ip_scales) decrimenting scale for pot%d\n",
		cp->phiscale[i]);
    }
}



/*
 * g6_npipes : number of pipelines reported to callers.  This is
 * MAXPIPELINESPERCHIP, or half of it when the library is built with
 * DOUBLE_CHECK_RESULT.
 */
int g6_npipes()
{
#ifdef DOUBLE_CHECK_RESULT
    return (MAXPIPELINESPERCHIP/2);
#else
    return MAXPIPELINESPERCHIP;
#endif
}

/* g6_npipes_ : trailing-underscore alias of g6_npipes(). */
int g6_npipes_()
{
    const int npipes = g6_npipes();

    return npipes;
}

//#define NBDEBUG 1

/*
 * read_neighbour_list : build a four-word command and write it to the
 * ip board of the given cluster with ipdatawrite().
 *
 *   word 0 : IPW_FO_INTERNAL_ADDRESS|FO_CMD0_ADR
 *   word 1 : 0x2
 *   word 2 : (mask << 10) | chipid
 *   word 3 : 0
 */
static void read_neighbour_list(int clusterid, int mask, int chipid)
{
    unsigned int request[4];

    request[3] = 0;
    request[2] = (mask << 10) | chipid;
    request[1] = 0x2;
    request[0] = IPW_FO_INTERNAL_ADDRESS|FO_CMD0_ADR;

#ifdef INTERNAL_OUT
    fprintf(stderr,"(read_neighbour_list) clusterid, mask, chipid  = %x %x %x\n",
            clusterid, mask, chipid);
#endif
    ipdatawrite(clusters[clusterid]->ipboardid, request);
}


/*
 * read_random_word : build a four-word command and write it to the ip
 * board of the given cluster with ipdatawrite().
 *
 *   word 0 : IPW_FO_INTERNAL_ADDRESS|FO_CMD0_ADR
 *   word 1 : 0x2
 *   word 2 : (1<<30)|(mask << 10) | chipid
 *   word 3 : address
 *
 * The command differs from read_neighbour_list() only in bit 30 of
 * word 2 and in carrying an address in word 3.  The INTERNAL_OUT trace
 * now names this function and includes the address; it previously
 * carried the label of read_neighbour_list.
 */
static void read_random_word(int clusterid, int mask, int chipid,int address)
{
  unsigned int ipdata[4];

  ipdata[0] = IPW_FO_INTERNAL_ADDRESS|FO_CMD0_ADR;
  ipdata[1] = 0x2;
  ipdata[2] = (1<<30)|(mask << 10) | chipid;
  ipdata[3] = address;
#ifdef INTERNAL_OUT
  fprintf(stderr,"(read_random_word) clusterid, mask, chipid, address  = %x %x %x %x\n",
	  clusterid, mask, chipid, address);
#endif
  ipdatawrite(clusters[clusterid]->ipboardid,ipdata);
}


/*
 * store_neighbour_list : copy the neighbour list of one chip from the
 * fo board's DPRAM window into nbmcount[] / nbmem[][].
 *
 * Waits until g6hib_getcounter() reports at least one word, then reads
 * three consecutive records (one per memory unit).  Each record starts
 * with a header word:
 *     bits  0- 8 : entry count
 *     bits 10-11 : unit id  (must equal the record's position, 0..2)
 *     bits 12-20 : chip id  (must equal ichip)
 * followed by two data words per entry.
 *
 * Return code: -1 on internal error (inconsistent header, or a count
 * that does not fit the 512-word row of nbmem).  Otherwise the OR of
 * the upper 16 bits of the first word of every entry in all three
 * units; callers treat nonzero as "some unit overflowed".
 *
 * Fixes relative to the earlier version:
 *  - the overflow accumulator is cleared once before the unit loop, so
 *    overflow seen in units 0 and 1 is no longer discarded;
 *  - a record whose word count exceeds the row size is rejected rather
 *    than written past the end of nbmem[imem].
 */
static int store_neighbour_list(int ci, int ichip,
				 int nbmcount[3], unsigned int nbmem[3][512])
{
    int imem, boardid;
    unsigned int * dprambase, *ramp;
    int overflown = 0;

    boardid = clusters[ci]->foboardid;
    while (g6hib_getcounter(clusters[ci]->foboardid)<1){
#ifdef INTERNAL_OUT
	fprintf(stderr,"counter = %x\n",g6hib_getcounter(clusters[ci]->foboardid));
#endif
	MB;
    }
#ifdef INTERNAL_OUT
    fprintf(stderr,"Counter = %x\n",g6hib_getcounter(clusters[ci]->foboardid));
#endif
    dprambase = ((unsigned int *)hib[boardid]) + (DPRAMBASE);

    /* slow but simple version: read the list word-by-word */
    for (imem =  0, ramp = dprambase; imem <3; imem ++){
	int k,nwords;
	unsigned int firstword;
	int unitid, chipid;

	firstword = *ramp;
#ifdef NBDEBUG
	fprintf(stderr,"store nbl, chip, imem, fst = %x %x %x\n",
		ichip, imem, firstword);
#endif
	nbmcount[imem] = (int) (firstword & 511);
	unitid = (firstword>>10)&3;
	chipid = (firstword>>12)&511;
	if ((unitid != imem) ||(chipid != ichip)  ){
	    fprintf(stderr,"store_neighbour_list: internal error, inconsistent first word %x %x %x\n",
		    firstword, ichip, imem);
	    return -1;
	}
	nwords = nbmcount[imem]*2;
	if (nwords > 512){
	    /* the 9-bit count field can describe more words than one row holds */
	    fprintf(stderr,"store_neighbour_list: internal error, count too large %x %x %x\n",
		    firstword, ichip, imem);
	    return -1;
	}
#ifdef INTERNAL_OUT
	fprintf(stderr, "store neighbour list: imem=%x,   count=%x\n",
		imem, nbmcount[imem]);
#endif
	ramp++;
	for(k = 0;k<nwords; k++, ramp++){
	    nbmem[imem][k] = *ramp;
#ifdef NBDEBUG
	    fprintf(stderr," nbmem at %x %x = %x\n", imem, k, nbmem[imem][k]);
#endif
	}
	/* accumulate across units; do not reset here */
	for(k = 0;k<nwords; k+=2) overflown |= (nbmem[imem][k]>>16);
#ifdef INTERNAL_OUT
	for(k = 0;k<nwords; k+=2){
	    fprintf(stderr," %8x %8x", nbmem[imem][k],nbmem[imem][k+1]);
	    if ( (k %8) == 6)fprintf(stderr,"\n");
	}
	fprintf(stderr,"\n");
#endif
    }
    return overflown;
}    


/*
 * test_nb_read_multi : manual check of a multi-chip neighbour-list read.
 *
 * Clears the counter of board 0, issues read_neighbour_list() for
 * cluster 0 with mask 0x380 and chip id 0, sleeps 100 ms, then prints
 * the counter value and calls dpramdump(0,30).  The section under
 * "#if 0" repeats the sequence with other mask / chip id pairs and is
 * currently compiled out.
 */
void test_nb_read_multi()
{
    counterclear(0);
    MB;
    read_neighbour_list(0,0x380,0);
    usleep(100000);
    fprintf(stderr,"Counter = %x\n",g6hib_getcounter(0));
    dpramdump(0,30);
#if 0
    counterclear(0);
    MB;
    read_neighbour_list(0,0x3e0,64);
    usleep(100000);
    dpramdump(0,30);

    counterclear(0);
    MB;
    read_neighbour_list(0,0x3f0,96);
    usleep(100000);
    dpramdump(0,30);

    counterclear(0);
    MB;
    read_neighbour_list(0,0x3f0,112);
    usleep(100000);
    dpramdump(0,30);

    counterclear(0);
    MB;
    read_neighbour_list(0,0x3fc,120);
    usleep(100000);
    dpramdump(0,30);
#endif
}

/* Library-local copy of the neighbour lists, indexed
   [cluster][chip][memory unit].
   nbmcount : number of entries stored for that unit.
   nbmem    : two words per entry; g6_get_neighbour_list_() tests word
              2k against a pipeline bit mask and returns word 2k+1. */
static int nbmcount[NCLUSTERS][MAXCHIPSPERCLUSTER][3];
static unsigned int nbmem[NCLUSTERS][MAXCHIPSPERCLUSTER][3][512];

/* largest number of words moved by one linkread call in
   store_neighbour_list_multiple() */
#define MAX_TRANSFER_LENGTH 1024

/*
 * store_neighbour_list_multiple : read the neighbour lists returned by
 * one multi-chip request and file them into nbmcount[ci] / nbmem[ci].
 *
 * Steps:
 *  1. wait until the fo board counter is at least 1, then re-read it
 *     until two consecutive reads agree; that value is the word count;
 *  2. copy that many words into a local linear buffer in pieces of at
 *     most MAX_TRANSFER_LENGTH words (DMA variant when built with DMA);
 *  3. walk the buffer as a sequence of records: a header word
 *     (bits 0-8 entry count, bits 10-11 unit id, bits 12-20 chip id)
 *     followed by two words per entry.
 *
 * Return code: -1 on internal error (more words than the buffer holds,
 * or a header whose unit id, chip id or count does not fit the
 * destination arrays).  Otherwise the OR of the upper 16 bits of the
 * first word of every entry; callers treat nonzero as overflow.
 *
 * Fixes relative to the earlier version:
 *  - the record-parsing state (unit, chip, count, position) lives
 *    outside the loop body; it was declared inside the body although
 *    its values are needed from one iteration to the next;
 *  - hardware-supplied sizes and indices are validated before use;
 *  - raw words are treated as unsigned before shifting, so a word with
 *    bit 31 set can no longer turn the result negative and be mistaken
 *    for the -1 error code;
 *  - unused locals were dropped.
 */
static int store_neighbour_list_multiple(int ci)
{
    static int linearnbmem[MAXCHIPSPERCLUSTER*3*512];
    const int capacity = (int)(sizeof(linearnbmem)/sizeof(linearnbmem[0]));
    int nwords, newmem, k;
    int overflown;
    /* parser state carried across loop iterations */
    unsigned int firstword = 0;
    int unitid = 0, chipid = 0, nmem = 0, imem = 0;

    while (g6hib_getcounter(clusters[ci]->foboardid)<1){
#ifdef INTERNAL_OUT
	fprintf(stderr,"counter = %x\n",g6hib_getcounter(clusters[ci]->foboardid));
#endif
	MB;
    }
    /* take the counter only once two successive reads agree */
    do{
	nwords = g6hib_getcounter(clusters[ci]->foboardid);
    }while (nwords != g6hib_getcounter(clusters[ci]->foboardid));
#ifdef NBDEBUG
    fprintf(stderr,"Counter = %x %x\n",nwords,
	    g6hib_getcounter(clusters[ci]->foboardid));
#endif

    if (nwords > capacity){
	fprintf(stderr,"store_neighbour_list_multiple: internal error, %d words exceed buffer of %d\n",
		nwords, capacity);
	return -1;
    }

    /* first copy data into linearnbmem. */
    for(k=0;k<nwords; k+= MAX_TRANSFER_LENGTH){
	int length = nwords - k;
	if (length > MAX_TRANSFER_LENGTH)length = MAX_TRANSFER_LENGTH;
#ifndef DMA
	linkread_with_offset(clusters[ci]->foboardid,k,length,linearnbmem+k);
#else
	linkread_dma_with_localoffset(clusters[ci]->foboardid,length,
				      k,linearnbmem+k);
#endif
    }

    /* then interpret the content */
    overflown = 0;

    for(k=0,newmem=1;k<nwords;k++){
	if(newmem){
	    firstword = (unsigned int) linearnbmem[k];
	    unitid = (firstword>>10)&3;
	    chipid = (firstword>>12)&511;
	    nmem = (int) (firstword & 511);
	    if ((unitid >= 3) || (chipid >= MAXCHIPSPERCLUSTER) || (nmem*2 > 512)){
		fprintf(stderr,"store_neighbour_list_multiple: internal error, inconsistent first word %x at %d\n",
			firstword, k);
		return -1;
	    }
	    nbmcount[ci][chipid][unitid]=nmem;
	    imem = 0;
	    /* an empty record is followed directly by the next header */
	    if (nmem > 0)newmem = 0;
	}else{
	    unsigned int word = (unsigned int) linearnbmem[k];
	    nbmem[ci][chipid][unitid][imem]=word;
	    if((imem & 1) == 0)overflown |= (word>>16);
	    imem++;
	    if(imem == nmem*2) newmem = 1;
	}
    }
    return overflown;
}    


/*
 * g6_read_neighbour_list_old_ : read the neighbour lists of all chips,
 * one chip per request, and store them in the local storage of the
 * library.
 *
 * Returns a negative value as soon as store_neighbour_list() reports
 * one; otherwise the OR of all its return values.
 */
int g6_read_neighbour_list_old_(int * clusterid)
{
    const int ci = *clusterid;
    GRAPE6_CLUSTER_PTR cp = clusters[ci];
    int overflown = 0;
    int i;

    for (i = 0; i < (cp->nchips); i++){
        int ichip;
        int iret;

        counterclear(cp->foboardid);
        MB;
        /* looked up as before; the request itself is addressed with i */
        ichip = cp->realchips[i];
        read_neighbour_list(ci, 0x3ff, i);
        MB;
        iret = store_neighbour_list(ci, i, nbmcount[ci][i], nbmem[ci][i]);
        if (iret < 0){
            return iret;
        }
        overflown |= iret;
    }
    return overflown;
}

/*
 * g6_read_neighbour_list_ : read the neighbour lists of all chips and
 * store them in the local storage of the library.  Chips are requested
 * in groups of up to MAX_NBL_CHIPS, which is (hopefully) faster than
 * the one-chip-at-a-time version.
 *
 * Returns a negative value as soon as store_neighbour_list_multiple()
 * reports one; otherwise the OR of all its return values.
 */
#define MAX_NBL_CHIPS 16

int g6_read_neighbour_list_(int * clusterid)
{
    const int ci = *clusterid;
    GRAPE6_CLUSTER_PTR cp = clusters[ci];
    int overflown = 0;
    int group = MAX_NBL_CHIPS;
    int mask;
    int i;

    /* the request mask depends only on the group size, so build it once */
    if (group > cp->maxchips){
        group = cp->maxchips;
    }
    mask = (~(group - 1))&0x3ff ;

    for (i = 0; i < (cp->nchips); i += MAX_NBL_CHIPS){
        int ichip;
        int iret;

        counterclear(cp->foboardid);
        MB;
        /* looked up as before; the request itself is addressed with i */
        ichip = cp->realchips[i];
        read_neighbour_list(ci, mask, i);
        MB;
        iret = store_neighbour_list_multiple(ci);
        if (iret < 0){
            return iret;
        }
        overflown |= iret;
    }
    return overflown;
}


/*
 * g6_print_chip_status_ : print one status line per chip slot of a
 * cluster to stderr.
 *
 * After g6_reset_chip_vcids(), each slot 0..maxchips-1 is queried with
 * read_random_word(ci, 0x3ff, slot, 0xd).  If fodatawait() returns 0
 * the first word read back is printed; otherwise "NO REPLY" is printed
 * and g6reset_pbonly() is called on the ip board.  Always returns 0.
 */
int g6_print_chip_status_(int * clusterid)
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    const int ci = *clusterid;
    int slot;

    g6_reset_chip_vcids(* clusterid);

    fprintf(stderr,"Chip status for cluster %d\n", *clusterid);
    for (slot = 0; slot < (cp->maxchips); slot++){
        static unsigned int databuf[128];
        int ichip;

        counterclear(cp->foboardid);
        MB;
        /* looked up as before; the query itself is addressed with slot */
        ichip = cp->realchips[slot];
        read_random_word(ci, 0x3ff, slot, 0xd);
        MB;
        if (fodatawait(clusters[ci]->foboardid,2) != 0){
            fprintf(stderr,"%3d NO REPLY\n", slot);
            g6reset_pbonly(cp->ipboardid);
            continue;
        }
        fodataread(clusters[ci]->foboardid,2,databuf);
        fprintf(stderr,"%3d %8x\n", slot, databuf[0]);
    }
    return 0;
}

/* g6_print_chip_status : by-value variant of g6_print_chip_status_(). */
int g6_print_chip_status(int clusterid)
{
    int cid = clusterid;

    return g6_print_chip_status_(&cid);
}

/* g6_read_neighbour_list : by-value variant of g6_read_neighbour_list_(). */
int g6_read_neighbour_list(int clusterid)
{
    int cid = clusterid;

    return g6_read_neighbour_list_(&cid);
}

/* g6_read_neighbour_list_old : by-value variant of
   g6_read_neighbour_list_old_(). */
int g6_read_neighbour_list_old(int clusterid)
{
    int cid = clusterid;

    return g6_read_neighbour_list_old_(&cid);
}

/*
 * compare : qsort() comparator for arrays of int, ascending order.
 *
 * i, j point to the two ints to compare.  Returns a negative value,
 * zero or a positive value when *i is less than, equal to or greater
 * than *j.
 *
 * The parameters use the const void * form that qsort() expects, and
 * the result is built from comparisons instead of "*i - *j", which
 * overflows for operands far apart (e.g. INT_MIN and 1).
 */
int compare(const void *i, const void *j)
{
    const int lhs = *(const int *)i;
    const int rhs = *(const int *)j;

    return (lhs > rhs) - (lhs < rhs);
}

/*
 * g6_get_neighbour_list_ : extract the neighbour list of one pipeline
 * from the lists previously stored in nbmcount[] / nbmem[].
 *
 *   clusterid  cluster whose stored lists are searched
 *   ipipe      pipeline number; ipipe/16 selects the memory unit and
 *              ipipe%16 the bit tested in the first word of each entry
 *   maxlength  capacity of nbl[]
 *   nblen      out: number of indices written to nbl[]
 *   nbl        out: neighbour indices (second word of each matching
 *              entry), sorted ascending when 0 is returned
 *
 * Returns 1 as soon as nbl[] is full (the list is then unsorted and may
 * be incomplete), 0 otherwise.
 *
 * With *maxlength <= 0 nothing is written any more: the first matching
 * entry makes the function return 1.  Previously such a call wrote
 * past nbl[] without limit.  Behaviour for positive *maxlength is
 * unchanged.
 */
int  g6_get_neighbour_list_(int * clusterid, int * ipipe,
			   int * maxlength, int * nblen,
			   int nbl[])
{
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    int ip, imem, mask;
    int i, k;
    ip = *ipipe % 16;
    imem = *ipipe/16;
    mask = 1 << ip;
    *nblen =  0;
    for (i = 0; i < (cp->nchips); i ++){
	for(k=0; k< nbmcount[*clusterid][i][imem]; k++){
	    if (nbmem[*clusterid][i][imem][k*2] & mask){
		/* only reachable with no room at all (*maxlength <= 0) */
		if (*nblen >= *maxlength) return 1;
		nbl[*nblen] = nbmem[*clusterid][i][imem][k*2+1];
		(*nblen) ++;
		if( *nblen == *maxlength) return 1;
	    }
	}
    }
    qsort(nbl, *nblen, sizeof(int), compare);

    return 0;
}
    


/* g6_get_neighbour_list : by-value variant of g6_get_neighbour_list_();
   nblen and nbl are passed through unchanged. */
int  g6_get_neighbour_list(int clusterid, int ipipe,
			   int maxlength, int * nblen,
			   int nbl[])
{
    int cid = clusterid;
    int pipe = ipipe;
    int capacity = maxlength;

    return g6_get_neighbour_list_(&cid, &pipe, &capacity, nblen, nbl);
}

/*
 * g6calc_firsthalf0_ : first half of a force calculation: set the
 * scales and data of *ni i-particles on cluster *clusterid, then set
 * the i-particle count and the j-particle count.
 *
 *   index, xi, vi       per-particle index, position, velocity
 *   fold, j6old, phiold values from which the scales are derived
 *                       (only used while sbufset == 0)
 *   eps2, h2            per-particle arrays; eps2 is read as a single
 *                       value when bit 0 of *mode is set
 */
void g6calc_firsthalf0_(int * clusterid,
			int * nj,
			int * ni,
			int index[],
			double xi[][3],
			double vi[][3],
		      double fold[][3],
			double j6old[][3],
			double phiold[],
			double eps2[],
			double h2[],
			int * mode)
     /* mode: 0 --- standard mode, regard both eps2 and h2 as array
	      1 --  GRAPE-4 compatibility mode, eps2 is scalar
	      2 (bit loc 1) skip send_njp... future extension */
{
    /* NOTE(review): ii2 is only used under DOUBLE_CHECK_RESULT and ni2
       is never used in this function. */
    int ii, ieps, dieps, ii2, ni2;
    GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
    /* dieps is the stride through eps2[]: 1 for an array, 0 for a scalar */
    dieps = 1;
    if ((*mode) & 1) dieps = 0;
    ieps = 0;
    if (cp->ijpdma == 0){
	/* particle-by-particle path: scales, then data, for each slot */
	for (ii= 0; ii< *ni; ii++){
	    double six = 6.0;
	    double zero = 0.0;
	    if (sbufset == 0){
		g6_set_i_particle_scales_from_real_value_(clusterid, &ii,  fold[ii],
							  j6old[ii], phiold+ii,
							  &six,&zero);
	    }else{
		g6_copy_i_particle_scales_(clusterid, &ii, fsbuf[*clusterid],
					   jsbuf[*clusterid],psbuf[*clusterid]);
	    }
	    g6_set_i_particle_(clusterid, &ii, index+ii, xi[ii], vi[ii],
			       eps2+ieps, h2+ii);
#ifdef DOUBLE_CHECK_RESULT
	    /* second copy of the same particle in slot ii + *ni */
	    ii2 = ii+ *ni;
	    g6_set_i_particle_scales_from_real_value_(clusterid, &ii2,  fold[ii],
						      j6old[ii], phiold+ii,
						      &six,&zero);
	    g6_set_i_particle_(clusterid, &ii2, index+ii, xi[ii], vi[ii],
			       eps2+ieps, h2+ii);
#endif
	    ieps += dieps;
	}
    }else{
	/* vector path: expand eps2 to one value per particle, set all
	   scales, then send every particle with a single call.
	   NOTE(review): eps2array has 48 entries and *ni is not checked
	   against that size here. */
	double eps2array[48];
	int izero = 0;
	if ((*mode)&1){
	    for (ii= 0; ii< *ni; ii++) eps2array[ii] = eps2[0];
	}else{
	    for (ii= 0; ii< *ni; ii++) eps2array[ii] = eps2[ii];
	}
	
	for (ii= 0; ii< *ni; ii++){
	    double six = 6.0;
	    double zero = 0.0;
	    if (sbufset == 0){
		g6_set_i_particle_scales_from_real_value_(clusterid, &ii,  fold[ii],
							  j6old[ii], phiold+ii,
							  &six,&zero);
	    }else{
		g6_copy_i_particle_scales_(clusterid, &ii, fsbuf[*clusterid],
					   jsbuf[*clusterid],psbuf[*clusterid]);
	    }
	}
	g6_set_i_particle_vector_(clusterid, &izero,ni, index, xi, vi, eps2array,
				  h2);
    }
    g6_set_nip_(clusterid, ni);
    /* bit 1 of mode selects g6_set_njp_real_() instead of g6_set_njp_() */
    if (((*mode) & 2) == 0) {
	g6_set_njp_(clusterid, nj);
    }else{
	g6_set_njp_real_(clusterid, nj);
    }
	
}


/* g6calc_firsthalf0 : by-value variant of g6calc_firsthalf0_(); the
   scalar arguments are passed by address, the arrays unchanged. */
void g6calc_firsthalf0(int clusterid,
		       int nj,
		       int ni,
		       int index[],
		       double xi[][3],
		       double vi[][3],
		       double fold[][3],
		       double j6old[][3],
		       double phiold[],
		       double eps2[],
		       double h2[],
		       int  mode)
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    int run_mode = mode;

    g6calc_firsthalf0_(&cid, &njp, &nip, index, xi, vi,
                       fold, j6old, phiold, eps2, h2, &run_mode);
}

/* g6calc_firsthalf_ : g6calc_firsthalf0_() with mode fixed to 1, i.e.
   eps2 points to a single value rather than an array. */
void g6calc_firsthalf_(int * clusterid,
		      int * nj,
		      int * ni,
		      int index[],
		      double xi[][3],
		      double vi[][3],
		      double fold[][3],
		      double j6old[][3],
		      double phiold[],
		      double *eps2,
		      double h2[])
{
    int scalar_eps_mode = 1;

    g6calc_firsthalf0_(clusterid, nj, ni, index, xi, vi,
                       fold, j6old, phiold, eps2, h2, &scalar_eps_mode);
}
    


/* g6calc_firsthalf : by-value variant of g6calc_firsthalf_(); eps2 is
   a single value. */
void g6calc_firsthalf(int  clusterid,
		      int  nj,
		      int  ni,
		      int index[],
		      double xi[][3],
		      double vi[][3],
		      double fold[][3],
		      double j6old[][3],
		      double phiold[],
		      double eps2,
		      double h2[])
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    double softening2 = eps2;

    g6calc_firsthalf_(&cid, &njp, &nip, index, xi, vi,
                      fold, j6old, phiold, &softening2, h2);
}

/* Slot index (position within the i-particle batch) of the last
   particle whose scales g6calc_lasthalf0a_() adjusted; that function
   resets it to -1 on entry.  Type made explicit; the original relied
   on implicit int. */
static int index_scalechanged;

/* g6_changed_index : return the slot index recorded above. */
int g6_changed_index()
{
    return index_scalechanged;
}
/* g6_changed_index_ : trailing-underscore alias of g6_changed_index(). */
int g6_changed_index_()
{
    return index_scalechanged;
}

/*
 * g6calc_lasthalf0a_ : second half of a force calculation: fetch the
 * results and, if any particle reports a scaling overflow, adjust that
 * particle's scales, resend it and recalculate.
 *
 *   acc, jerk, pot, nnbindex  filled by g6_get_force_etc_()
 *   index, xi, vi, eps2, h2   used to resend a particle after a scale
 *                             change
 *   mode                      bit 0 set: eps2 is a single value;
 *                             otherwise eps2[ii] is used
 *
 * Returns 0 on success and -1 on any error: a nonzero return from
 * g6_get_force_etc_(), a communication error flag, too many retries,
 * or the CPU-time limit being exceeded.  Calls exit(-1) when CPU time
 * exceeds twice cp->time_hard_limit.
 *
 * Side effect: index_scalechanged is set to -1 on entry and to the
 * slot of each particle whose scales are changed.
 */
int g6calc_lasthalf0a_(int * clusterid,
		     int * nj,
		     int * ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double *eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[],
		       int nnbindex[],
		     int *mode)
{
    int flag[MAXPIPELINESPERCHIP];
    int ii;
    int error;
    int recalc = 1;
    int nretry = 0;
    double * eps2p;
    int ii2;
    index_scalechanged = -1;
#ifdef INTERNAL_OUT
    fprintf(stderr,"Enter lasthalf0a %lf\n",*eps2);
#endif    
    while (recalc){
	recalc = 0;
	error = g6_get_force_etc_(clusterid, acc, jerk, pot,nnbindex, flag);
	if (jpbuf_dump_mode)dump_dmawdata(*clusterid, (*ni)*14);
#ifdef 	INTERNAL_OUT
	dump_dmawdata(*clusterid, (*ni)*14);
#endif
	if (error == 0){
	    
	    /* inspect the flag word of every particle in the batch */
	    for (ii= 0; ii< *ni; ii++){
		int flg = g6_test_flag_(clusterid, flag+ii);

		if (flg == COMMERROR){
		    error ++;
		    fprintf(stderr,"(g6calc_lasthalf) communication error  %x -- abort\n",
			    flag[ii]);
		    dump_dmawdata(*clusterid, (*ni)*14);
		    return -1;
		}else if (flg != 0){
		    /* scaling overflow: lower the scales, resend this
		       particle and request another pass */
		    index_scalechanged = ii;
		    fprintf(stderr,"(g6calc_lasthalf) overflow for  %x  %x  %d %d %d-- change scales\n",
			    flg, flag[ii], ii, index[ii],*ni);
		    g6_adjust_ip_scales_(clusterid,&ii,flag+ii);
		    eps2p = eps2;
		    if (!(*mode & 1)) eps2p = eps2 + ii;
		    g6_set_i_particle_(clusterid, &ii, index+ii, xi[ii], vi[ii], eps2p,
				       h2+ii);
#ifdef DOUBLE_CHECK_RESULT
		    ii2 = ii + *ni;
		    g6_adjust_ip_scales_(clusterid,&ii2,flag+ii);
		    g6_set_i_particle_(clusterid, &ii2, index+ii, xi[ii],
				       vi[ii], eps2p, h2+ii);
#endif		    
		    recalc = 1;
		}
	    }
	    /* give up once more than 10 retries have been made; this is
	       reported through the generic error path below */
	    if (nretry > 10) {
		error = 1;
		recalc = 0;
	    }
	    if (recalc){
		g6_set_nip_(clusterid, ni);
		g6_set_njp_(clusterid, nj);
		nretry ++;
	    }
	}
	{
	    /* call_count counts passes through this loop across all
	       calls; the CPU-time limit is checked on every 10000th */
	    static int call_count = 0;
	    call_count ++;
	    if (call_count == 10000){
		GRAPE6_CLUSTER_PTR cp = clusters[*clusterid];
		call_count = 0;
		if( cpu_second()-cp->cputime_at_start>
		    cp->time_hard_limit*2){
		    fprintf(stderr,"Too long usage: %f secs. ",
			    cpu_second()-cp->cputime_at_start );
		    fprintf(stderr," force termination\n ");
		    
		    exit(-1);
		}
		if( cpu_second()-cp->cputime_at_start>
		    cp->time_hard_limit){
		    fprintf(stderr,"Too long usage: %f secs. ",
			    cpu_second()-cp->cputime_at_start );
		    fprintf(stderr," error status forced\n ");
		    
		    return -1;
		}
	    }
	}
	if (error) {
	    fprintf(stderr,"(g6calc_lasthalf) communication error -- abort\n");
	    dump_dmawdata(*clusterid, (*ni)*14);
	    /*	    g6_close_(clusterid);*/
	    /*	    reset_fofpga(clusters[*clusterid]);*/
	    /*	    g6_open_(clusterid);*/
	    g6_print_chip_status(*clusterid);
	    usleep(50000); 
	    return -1;
	}
    }
#ifdef INTERNAL_OUT
    for(ii = 0;ii< *ni; ii++){
	fprintf(stderr,"i, acc, jerk, pot %d  %lf %lf %lf %lf %lf %lf %lf\n",ii,
		acc[ii][0],acc[ii][1],acc[ii][2],
		jerk[ii][0],jerk[ii][1],jerk[ii][2],pot[ii]);
    }
#endif        
    return 0;
}

/* g6calc_lasthalf0 : by-value variant of g6calc_lasthalf0_(); returns
   its result unchanged. */
int g6calc_lasthalf0(int clusterid,
		     int nj,
		     int ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double *eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[],
		     int mode)
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    int run_mode = mode;

    return g6calc_lasthalf0_(&cid, &njp, &nip, index, xi, vi,
                             eps2, h2, acc, jerk, pot, &run_mode);
}

/* g6calc_lasthalf0a : by-value variant of g6calc_lasthalf0a_(); returns
   its result unchanged. */
int g6calc_lasthalf0a(int clusterid,
		      int nj,
		      int ni,
		      int index[],
		      double xi[][3],
		      double vi[][3],
		      double *eps2,
		      double h2[],
		      double acc[][3],
		      double jerk[][3],
		      double pot[],
		      int nnbindex[],
		      int mode)
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    int run_mode = mode;

    return g6calc_lasthalf0a_(&cid, &njp, &nip, index, xi, vi,
                              eps2, h2, acc, jerk, pot,
                              nnbindex, &run_mode);
}


/* g6calc_lasthalf2_ : g6calc_lasthalf0a_() with mode fixed to 1 (eps2
   points to a single value); nnbindex is supplied by the caller. */
int g6calc_lasthalf2_(int * clusterid,
		     int * nj,
		     int * ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double *eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[],
		      int nnbindex[])
{
    int scalar_eps_mode = 1;

    return g6calc_lasthalf0a_(clusterid, nj, ni, index, xi, vi,
                              eps2, h2, acc, jerk, pot,
                              nnbindex, &scalar_eps_mode);
}

/* g6calc_lasthalf0_ : g6calc_lasthalf0a_() for callers that do not
   want the nnbindex output; a local scratch array receives it and is
   discarded. */
int g6calc_lasthalf0_(int * clusterid,
		     int * nj,
		     int * ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double *eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[],
		     int * mode )
{
    int scratch_nnb[MAXPIPELINESPERCHIP];

    return g6calc_lasthalf0a_(clusterid, nj, ni, index, xi, vi,
                              eps2, h2, acc, jerk, pot,
                              scratch_nnb, mode);
}

/* g6calc_lasthalf_ : g6calc_lasthalf0_() with mode fixed to 1 (eps2
   points to a single value). */
int g6calc_lasthalf_(int * clusterid,
		     int * nj,
		     int * ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double *eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[])
{
    int scalar_eps_mode = 1;

    return g6calc_lasthalf0_(clusterid, nj, ni, index, xi, vi,
                             eps2, h2, acc, jerk, pot,
                             &scalar_eps_mode);
}
    
/* g6calc_lasthalf2 : by-value variant of g6calc_lasthalf2_(); eps2 is a
   single value. */
int g6calc_lasthalf2(int  clusterid,
		     int  nj,
		     int  ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[],
		    int nnbindex[])
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    double softening2 = eps2;

    return g6calc_lasthalf2_(&cid, &njp, &nip, index, xi, vi,
                             &softening2, h2, acc, jerk, pot, nnbindex);
}
/* g6calc_lasthalf : by-value variant of g6calc_lasthalf_(); eps2 is a
   single value. */
int g6calc_lasthalf(int  clusterid,
		     int  nj,
		     int  ni,
		     int index[],
		     double xi[][3],
		     double vi[][3],
		     double eps2,
		     double h2[],
		     double acc[][3],
		     double jerk[][3],
		     double pot[])
{
    int cid = clusterid;
    int njp = nj;
    int nip = ni;
    double softening2 = eps2;

    return g6calc_lasthalf_(&cid, &njp, &nip, index, xi, vi,
                            &softening2, h2, acc, jerk, pot);
}
 


/* When nonzero, calculate_accel_by_grape6_separate_trial_noopen()
   overwrites a, jerk and pot with the guesses from
   g6_guestimate_acc_etc() before calculating.  Enabled by default. */
static int calculate_accel_scaling = 1;

/* set_calculate_accel_scaling_mode : store mode in the switch above and
   return the stored value. */
int set_calculate_accel_scaling_mode(int mode)
{
    return (calculate_accel_scaling = mode);
}
/* g6_set_calculate_accel_scaling_mode : g6_-prefixed alias of
   set_calculate_accel_scaling_mode(). */
int g6_set_calculate_accel_scaling_mode(int mode)
{
    const int stored = set_calculate_accel_scaling_mode(mode);

    return stored;
}

/*
 * calculate_accel_by_grape6_separate_trial_noopen : one attempt to
 * compute acceleration, jerk and potential on ni particles (xi, vi)
 * due to nj particles (xj, vj, m) on an already opened cluster.
 *
 * All j-particles are loaded with time 0 and zero higher derivatives,
 * then the i-particles are processed in batches of g6_npipes() through
 * g6calc_firsthalf_() / g6calc_lasthalf_().  eps2 is used both as the
 * softening and as the h2 value of every i-particle.
 *
 * When a batch fails, the cluster is reset, closed and reopened and the
 * whole procedure restarts from START, until ERRMAX failures have
 * occurred.  Returns 0 on success, -1 after too many failures (or, when
 * built with JPIPTEST, on the first failure).
 *
 * Note: ERRMAX and IPLIMIT are #defined inside this body and remain
 * defined for the rest of the file.
 */
int calculate_accel_by_grape6_separate_trial_noopen(int clusterid,
					     int ni,
					     double xi[][3],
					     double vi[][3],
					     int nj,
					     double xj[][3],
					     double vj[][3],
					     double m[],
					     double a[][3],
					     double jerk[][3],
					     double pot[],
					     double eps2)
{
#define IPLIMIT MAXPIPELINESPERCHIP
    double ajtmp[3];
    double jjtmp[3];
    double j2jtmp[3];
    double ti, tj, dtj;
    /* NOTE(review): j0, i0 and flag are declared but not used here */
    int j0, i0;
    int i,k,ii, iend;
    int flag[MAXPIPELINESPERCHIP+1];
    int index[MAXPIPELINESPERCHIP+1];
    double h2[MAXPIPELINESPERCHIP+1];
    double eps2array[MAXPIPELINESPERCHIP+1];
    int nharderror = 0;
    int ipmax = g6_npipes();
    ti = 0.0;tj =0.0; dtj = 0.0078125;
    for(k=0;k<3;k++){
	ajtmp[k] = 0.0;
	jjtmp[k] = 0.0;
	j2jtmp[k] = 0.0;
    }
    /* restart point after a hard error: everything from here on is
       redone, including reloading the j-particles */
START:
    g6_set_ti_(&clusterid, &ti);
    g6_initialize_jp_buffer(clusterid, nj);
    for(i=0;i<nj;i++){
	g6_set_j_particle_(&clusterid,&i,&i,&tj,&dtj,m+i,
			   j2jtmp,jjtmp,ajtmp,vj[i],xj[i]);
    }
    g6_flush_jp_buffer(clusterid);
    /* optionally seed a, jerk, pot with order-of-magnitude guesses;
       they are passed to g6calc_firsthalf_() below */
    if(calculate_accel_scaling){
	g6_guestimate_acc_etc(ni, eps2, m, a, jerk, pot);
    }

#ifdef TESTMAIN
    usleep(100);
    g6setcbjpmode(clusters[clusterid]->ipboardid, 15, 0, 2); 
    usleep(100);
    g6reset_pbonly(clusters[clusterid]->ipboardid);
    usleep(100);
#endif
    
    for(i=0;i<ipmax;i++){
	h2[i] = eps2;
	eps2array[i] = eps2;
    }
    for(i=0;i<ni;i+=ipmax){
	int error;
	/* iend is the size of this batch (the last one may be short) */
	iend = ipmax; if  (iend+i > ni) iend = ni-i;
	for(ii=0;ii<iend;ii++)index[ii]=i+ii;
	g6calc_firsthalf_(&clusterid, &nj, &iend,index,&(xi[i]),&(vi[i]),&(a[i]),&(jerk[i]),
			  pot+i,eps2array, h2);
	/* assignment in the condition is intentional: a nonzero result
	   is a hard error */
	if (error = g6calc_lasthalf_(&clusterid,&nj,&iend,index,&xi[i],&vi[i],eps2array,h2,
				     &a[i], &jerk[i], pot+i)){
#ifdef JPIPTEST
	    fprintf(stderr,"Hard error, %x return -1\n", error);
	    {
#if 1		
		int x;
		fprintf(stderr,"Enter some number to continue:");
		scanf("%d", &x);
#else
		sleep(4);
#endif		
	    }
	    return -1;
#endif
#ifndef TESTMAIN	    
#define	 ERRMAX 10
#else
#define ERRMAX 2
#endif	    
	    nharderror ++;
	    /*	    g6_print_chip_status(clusterid);*/
	    fprintf(stderr,"(calculate_accell_trial_noopen) hard error %d\n", error);
	    g6_reset_(&clusterid);
	    g6_reset_fofpga_(&clusterid);
	    fprintf(stderr,"(calculate_accell_trial_noopen) calling g6close\n");
	    g6_close_(&clusterid);
	    fprintf(stderr,"(calculate_accell_trial_noopen) calling g6open\n");
	    g6_open_(&clusterid);
	    if (nharderror < ERRMAX){
		goto START;
	    }else{
		fprintf(stderr,"(calculate_accell_trial_noopen) too many errors... %d returning -1\n", nharderror);
		return -1;
	    }
	}
#ifdef JPIPTEST
	dump_dmawdata(clusterid, iend*14);
#endif
    }
    return 0;
}

/* squared relative tolerances for the net force and the net jerk */
static double tol2a = 1e-20;
static double tol2j = 1e-0;

/*
 * check_newton_3rd_law : consistency check on a set of forces.
 *
 * For each axis the mass-weighted sums of acc and jerk over the n
 * particles are formed, together with the sums of the squared
 * mass-weighted components over all axes.  The check fails when, on
 * any axis, the squared net sum is at least the sum of squares times
 * the matching tolerance (tol2a for acc, tol2j for jerk).  A quantity
 * whose sum of squares is not positive is not tested.
 *
 * Returns 1 on failure (the sums are also printed to stderr), else 0.
 */
int check_newton_3rd_law(int n,
			  double m[],
			  double acc[][3],
			  double jerk[][3])
{
    double accsum[3] = {0, 0, 0};
    double jerksum[3] = {0, 0, 0};
    double acc2sum = 0;
    double jerk2sum = 0;
    double msum = 0;    /* accumulated as before; not used in the test */
    int violated = 0;
    int ip, dim;

    for (ip = 0; ip < n; ip++){
        const double mass = m[ip];
        for (dim = 0; dim < 3; dim++){
            const double force = acc[ip][dim]*mass;
            const double mjerk = jerk[ip][dim]*mass;

            accsum[dim] += force;
            acc2sum += force*force;
            jerksum[dim] += mjerk;
            jerk2sum += mjerk*mjerk;
        }
        msum += mass;
    }

    for (dim = 0; dim < 3; dim++){
        if (acc2sum > 0.0 && accsum[dim]*accsum[dim] >= acc2sum*tol2a){
            violated = 1;
        }
        if (jerk2sum > 0.0 && jerksum[dim]*jerksum[dim] >= jerk2sum*tol2j){
            violated = 1;
        }
    }
#ifdef INTERNAL_OUT
    fprintf(stderr,"check_3rd_law, a, j = %e %e %e %e\n       %e %e %e %e\n",
            accsum[0],accsum[1],accsum[2],acc2sum,
            jerksum[0],jerksum[1],jerksum[2],jerk2sum);
#endif
    if (violated){
        fprintf(stderr,"check_3rd_law, a, j = %e %e %e %e %e %e %e %e\n",
                accsum[0],accsum[1],accsum[2],acc2sum,
                jerksum[0],jerksum[1],jerksum[2],jerk2sum);
    }
    return violated;
}
    

/* dump_force_and_phi : print index, the three acceleration components
   and the potential of the first n particles to stderr, one particle
   per line. */
static void dump_force_and_phi(int n,
			  double acc[][3],
			  double phi[])
{
    int ip = 0;

    while (ip < n){
        const double *row = acc[ip];
        fprintf(stderr,"%5d  %23.16e %23.16e %23.16e %23.16e\n",
                ip, row[0], row[1], row[2], phi[ip]);
        ip++;
    }
}
    


/*
 * calculate_accel_by_grape6_noopen : self-gravity of n particles on an
 * already opened cluster, repeated until the result passes
 * check_newton_3rd_law().
 *
 * Each pass calls calculate_accel_by_grape6_separate_trial_noopen()
 * with the same particles as i- and j-particles.  On failure the board
 * is reset, the cluster's cmerror_count is incremented, the function
 * sleeps one second and tries again; after more than ERRMAX failures
 * the process exits with -1.  Returns 0 once a pass succeeds.  When
 * built with JPIPTEST it returns 0 after the first pass regardless of
 * errors.
 */
int calculate_accel_by_grape6_noopen(int clusterid,
					     int n,
					     double x[][3],
					     double v[][3],
					     double m[],
					     double a[][3],
					     double jerk[][3],
					     double pot[],
					     double eps2)
{
    int error, error_prev;
    int errorcount = 0;
    error_prev = 0;
    do{
	/*	set_simulator_use(clusters[clusterid], 0);*/
#ifdef JPIPTEST
	set_simulator_use(clusters[clusterid], 1);
#endif
#ifdef TESTMAIN
	set_simulator_use(clusters[clusterid], 1);
#endif
	error = calculate_accel_by_grape6_separate_trial_noopen(clusterid,n,x,v,n,x,v,
							m,a,jerk, pot, eps2);
	/* only validate a result that was obtained without a hard error */
	if (!error) error = check_newton_3rd_law(n,m,a,jerk);
	if (error) 	{
	    fprintf(stderr,"Force in Error\n");
#ifdef INTERNAL_OUT
	    dump_force_and_phi(n,a,pot);
#endif
	    errorcount ++;
	    g6reset(clusters[clusterid]->ipboardid);
	    clusters[clusterid]->errorcounters.cmerror_count++;
	    set_simulator_use(clusters[clusterid], 0);
	    sleep(1);
	    if (errorcount > ERRMAX){
		fprintf(stderr,"Too many errors, exiting...\n");
		exit(-1);
	    }
	    error_prev = 1;
	}else{
	    /* report recovery if an earlier pass had failed */
	    if (error_prev){
		fprintf(stderr,"Force Corrected\n");
#ifdef INTERNAL_OUT
		dump_force_and_phi(n,a,pot);
#endif
	    }
	}
#ifdef JPIPTEST
	/* here, ignore error and continue ... */
	return 0;
#endif	
    }while(error);
    set_simulator_use(clusters[clusterid], 0);
    return error;
}



    
#ifdef TESTMAIN

#define NMAX  100000
/*
 * Test driver (built with TESTMAIN).
 *
 * usage: prog [clusterid [n [mode]]]
 *   clusterid  cluster to open (default 0)
 *   n          number of particles (default 100)
 *   mode       passed as the third argument of g6setcbjpmode()
 *              (default 0)
 *
 * Runs ten steps; each step draws new random positions and velocities
 * and calls calculate_accel_by_grape6_noopen().  Exits with -1 if a
 * step reports an error.  After the last step acc, jerk and pot of all
 * particles are printed to stdout.
 */
int main(int argc, char **argv)
{
    static  double x[NMAX][3];
    static      double v[NMAX][3];
    static      double a2[NMAX][3];
    static      double j6[NMAX][3];
    static      double jd18[NMAX][3];
    static      double acc[NMAX][3];
    static      double jerk[NMAX][3];
    static      double pot[NMAX];
    /* NOTE(review): flag[] is never written in this function; the final
       printf prints its zero-initialised contents */
    static      int    flag[NMAX];
    static      double m[NMAX];
    static      double t[NMAX];
    static      double dt[NMAX];
    int i,k,n,ii;
    int clusterid = 0;
    int mode = 0;
    double ti = 0;
    double eps2 = 0.01;
    double h2 = 2;
    n = 100;
    if (argc > 1){
	clusterid = atoi(argv[1]);
    }
    fprintf(stderr,"clusterid = %d\n", clusterid);
    if (argc > 2){
	n = atoi(argv[2]);
    fprintf(stderr,"n = %d\n", n);
    }
    if (argc <= 3 ){
	fprintf(stderr,"Use default mode = %d\n", mode);
	fprintf(stderr," mode 0 = local broadcast\n");
	fprintf(stderr," mode 1 = 2-way multicast\n");
	fprintf(stderr," mode 2 = p-to-p\n");
    }else{
	mode = atoi(argv[3]);
    }
    fprintf(stderr,"mode = %d\n", mode);

    /* fixed seed: the particle set is the same on every run */
    srand48(12345);
    for(i=0;i<n;i++){
	for(k=0;k<3;k++){
	    x[i][k] = (n-i-1)*(k+2)/4.0;
#if 1
	    /* overrides the regular positions set just above */
	    x[i][k] = drand48();
	    v[i][k] = drand48();
#endif
	    a2[i][k] = j6[i][k] = jd18[i][k] = 0;
	    acc[i][k] = jerk[i][k] = 1;
	}
	m[i] = 1.0/n;
	t[i]  = 0;
	dt[i] = 0.125;
	pot[i] = 1;
    }
#ifdef INTERNAL_OUT
    jpbuf_dump_mode = 1;
#endif    
    g6_open_(&clusterid);

    g6_initialize_jp_buffer(clusterid,10);

    if (1){
	double fscale = m[0]/eps2*100;
	double phiscale = m[0]/sqrt(eps2)*100;
	for(i=0;i<n;i++){
	    for(k=0;k<3;k++)acc[i][k] = fscale;
	    pot[i] = phiscale;
	}
	
	for(ii = 0;ii<10;ii++){
	    fprintf(stderr,"step %d\n", ii);
#if 1
	    for(i=0;i<n;i++){
		for(k=0;k<3;k++){
		    x[i][k] = drand48();
		    v[i][k] = drand48();
		}
	    }
#endif
	    g6setcbjpmode(clusters[clusterid]->ipboardid, 15, mode, 2); 
	    usleep(100);
	    g6reset_pbonly(clusters[clusterid]->ipboardid);
	    usleep(100);

	    if (calculate_accel_by_grape6_noopen(clusterid,n,x,v,m,acc,jerk,pot,eps2)){
		fprintf(stderr,"Force calculation error, exit\n");
		exit(-1);
	    }
			
	}
	for(i=0;i<n;i++){
	    printf("i:%5d acc: %e  %e  %e \n          jerk %e  %e  %e \n pot, flag: %e %x\n",
		   i, acc[i][0], acc[i][1], acc[i][2],
		   jerk[i][0], jerk[i][1], jerk[i][2],
		   pot[i], flag[i]);
	}
    }
    g6_close_(&clusterid);
    return 0;
}
#endif
#ifdef MCTESTMAIN

#define NMAX  100000
/*
 * Two-cluster test driver (built with MCTESTMAIN).
 *
 * usage: prog [n]      n = number of particles (default 100)
 *
 * Opens clusters 0 and 1 and runs 10000 steps.  Each step draws new
 * random positions and velocities, loads the j-particles into both
 * clusters, then computes the forces in batches of g6_npipes() on
 * cluster 0.  A failing batch resets and reopens both clusters and
 * restarts the step from START; the process exits with -1 once ten
 * failures have occurred within one step.  After the last step acc,
 * jerk and pot of all particles are printed to stdout.
 *
 * main is declared int and returns 0; the earlier "void main" form is
 * not a standard signature.
 */
int main(int argc, char **argv)
{
    static  double x[NMAX][3];
    static      double v[NMAX][3];
    static      double a2[NMAX][3];
    static      double j6[NMAX][3];
    static      double jd18[NMAX][3];
    static      double acc[NMAX][3];
    static      double jerk[NMAX][3];
    static      double pot[NMAX];
    /* never written here; the final printf prints its zero-initialised
       contents (the flag[] declared inside the step loop is a separate
       array) */
    static      int    flag[NMAX];
    static      double m[NMAX];
    static      double t[NMAX];
    static      double dt[NMAX];
    int i,k,n,ir;
    int clusterid = 0;
    int cid1 = 1;
    double ti = 0;
    double eps2 = 0.01;
    double h2 = 2;

    g6_open_(&clusterid);
    g6_open_(&cid1);

    if (argc == 1){
	fprintf(stderr,"Use default n = %d\n", 100);
	n = 100;
    }else{
	n = atoi(argv[1]);
    }
    fprintf(stderr,"n = %d\n", n);

    /* fixed seed: the particle set is the same on every run */
    srand48(12345);
    for(i=0;i<n;i++){
	for(k=0;k<3;k++){
	    x[i][k] = (n-i-1)*(k+2)/4.0;
#if 1
	    /* overrides the regular positions set just above */
	    x[i][k] = drand48();
	    v[i][k] = drand48();
#endif
	    a2[i][k] = j6[i][k] = jd18[i][k] = 0;
	    acc[i][k] = jerk[i][k] = 1;
	}
	m[i] = 1.0/n;
	t[i]  = 0;
	dt[i] = 0.125;
	pot[i] = 1;
    }


    if (1){
	double fscale = m[0]/eps2*100;
	double phiscale = m[0]/sqrt(eps2)*100;
	g6reset(clusters[0]->ipboardid);
	g6reset(clusters[1]->ipboardid);
	g6errorcounter_clear(0);
	g6errorcounter_clear(1);
	for(i=0;i<n;i++){
	    for(k=0;k<3;k++)acc[i][k] = fscale;
	    pot[i] = phiscale;
	}
	
	for(ir = 0;ir<10000;ir++){
	    /* the declarations below shadow the outer i, k, ti, h2 and
	       flag for the duration of one step */
	    double ajtmp[3];
	    double jjtmp[3];
	    double j2jtmp[3];
	    double ti, tj, dtj;
	    int j0, i0;
	    int i,k,ii, iend;
	    int flag[MAXPIPELINESPERCHIP+1];
	    int index[MAXPIPELINESPERCHIP+1];
	    double h2[MAXPIPELINESPERCHIP+1];
	    double eps2array[MAXPIPELINESPERCHIP+1];
	    int nharderror = 0;
	    int ipmax = g6_npipes();
	    fprintf(stderr,"step %d\n", ir);
#if 1
	    for(i=0;i<n;i++){
		for(k=0;k<3;k++){
		    x[i][k] = drand48();
		    v[i][k] = drand48();
		}
	    }
#endif
	    set_simulator_use(clusters[clusterid], 1);
	    ti = 0.0;tj =0.0; dtj = 0.0078125;
	    for(k=0;k<3;k++){
		ajtmp[k] = 0.0;
		jjtmp[k] = 0.0;
		j2jtmp[k] = 0.0;
	    }
	    /* restart point of the current step after a hard error */
	START:
	    g6_set_ti_(&clusterid, &ti);
	    g6_set_ti_(&cid1, &ti);

	    usleep(10);
	    g6setcbjpmode(clusters[0]->ipboardid, 15, 1, 2);
	    g6setcbjpmode(clusters[1]->ipboardid, 15, 1, 2);
	    usleep(10);
	    g6reset_pbonly(clusters[0]->ipboardid);
	    g6reset_pbonly(clusters[1]->ipboardid);
	    
	    g6_set_ijp_mode(0,1);
	    g6_set_ijp_mode(1,1);
	    for(i=0;i<n;i++){
		g6_set_j_particle_(&clusterid,&i,&i,&tj,&dtj,m+i,
				   j2jtmp,jjtmp,ajtmp,v[i],x[i]);
		g6_set_j_particle_(&cid1,&i,&i,&tj,&dtj,m+i,
				   j2jtmp,jjtmp,ajtmp,v[i],x[i]);
	    }
	    usleep(10);
	    g6_set_ijp_mode(0,0);
	    g6_set_ijp_mode(1,0);
	    g6_guestimate_acc_etc(n, eps2, m, acc, jerk, pot);
#if 1
	    usleep(10);
	    g6setcbjpmode(clusters[0]->ipboardid, 15, 0, 2); 
	    g6setcbjpmode(clusters[1]->ipboardid, 15, 0, 2);
	    usleep(10);
	    g6reset_pbonly(clusters[0]->ipboardid);
	    g6reset_pbonly(clusters[1]->ipboardid);
	    usleep(10);
#endif
	    for(i=0;i<ipmax;i++){
		h2[i] = eps2;
		eps2array[i] = eps2;
	    }
	    for(i=0;i<n;i+=ipmax){
		int error;
		/* iend is the size of this batch (the last one may be short) */
		iend = ipmax; if  (iend+i > n) iend = n-i;
		for(ii=0;ii<iend;ii++)index[ii]=i+ii;
		g6calc_firsthalf_(&clusterid, &n, &iend,index,&(x[i]),&(v[i]),&(acc[i]),&(jerk[i]),
				  pot+i,eps2array, h2);
		/* assignment in the condition is intentional: a nonzero
		   result is a hard error */
		if (error = g6calc_lasthalf_(&clusterid,&n,&iend,index,&x[i],&v[i],eps2array,h2,
					     &acc[i], &jerk[i], pot+i)){
		    nharderror ++;
		    fprintf(stderr,"(calculate_accell_trial_noopen) hard error %d\n", error);
		    g6_reset_(&clusterid);
		    g6_reset_fofpga_(&clusterid);
		    g6_close_(&clusterid);
		    g6_open_(&clusterid);
		    g6_reset_(&cid1);
		    g6_reset_fofpga_(&cid1);
		    g6_close_(&cid1);
		    g6_open_(&cid1);
		    if (nharderror < 10){
			goto START;
		    }else{
			fprintf(stderr,"(calculate_accell_trial_noopen) too many errors... %d returning -1\n", nharderror);
			exit(-1);
		    }
		}
	    }
	}
	    /*	    calculate_accel_by_grape6_noopen(clusterid,n,x,v,m,acc,jerk,pot,eps2);*/
	for(i=0;i<n;i++){
	    printf("i:%5d acc: %e  %e  %e \n          jerk %e  %e  %e \n pot, flag: %e %x\n",
		   i, acc[i][0], acc[i][1], acc[i][2],
		   jerk[i][0], jerk[i][1], jerk[i][2],
		   pot[i], flag[i]);
	}
    }
    return 0;
}
#endif


#ifdef JPIPTEST

#define NMAX  10
/*
 * JPIPTEST driver.
 *
 * Opens cluster 0 once, then loops forever over itest = 0..52.  For each
 * itest a two-particle system is set up with particle 0 at the origin and
 * particle 1 at x = scalb(1.0, -itest) on the first axis, the initial
 * acc/pot values are set from that separation, and
 * calculate_accel_by_grape6_noopen() is called.  The first acceleration
 * component and the potential of every particle are reported on stderr.
 *
 * The outer loop never terminates, so the trailing return is not reached.
 */
int main(void)
{
    static double x[NMAX][3];
    static double v[NMAX][3];      /* never written here: stays zero */
    static double a2[NMAX][3];
    static double j6[NMAX][3];
    static double jd18[NMAX][3];
    static double acc[NMAX][3];
    static double jerk[NMAX][3];
    static double pot[NMAX];
    static double m[NMAX];
    static double t[NMAX];
    static double dt[NMAX];
    int clusterid = 0;
    int n = 2;
    double eps2 = 0.00000000000000001;
    int ip, axis, itest;
    double sep, fscale, phiscale;

    g6_open_(&clusterid);

    for (;;) {
        for (itest = 0; itest < 53; itest++) {
            /* Reset every per-particle quantity before each step. */
            for (ip = 0; ip < n; ip++) {
                for (axis = 0; axis < 3; axis++) {
                    x[ip][axis] = 0;
                    a2[ip][axis] = j6[ip][axis] = jd18[ip][axis] = 0;
                    acc[ip][axis] = jerk[ip][axis] = 1;
                }
                m[ip] = 1.0/n;
                t[ip] = 0;
                dt[ip] = 0.125;
                pot[ip] = 1;
            }

            /* Two bodies separated by scalb(1.0, -itest) along axis 0. */
            x[0][0] = 0.0;
            x[1][0] = scalb(1.0, -itest);
            sep = x[0][0] - x[1][0];
            fscale = 1.0/sep/sep;
            phiscale = 1.0/sep;

            g6reset(clusters[0]->ipboardid);
            g6errorcounter_clear(0);

            /* Initial acc/pot values derived from the separation. */
            for (ip = 0; ip < n; ip++) {
                for (axis = 0; axis < 3; axis++) {
                    acc[ip][axis] = fscale;
                }
                pot[ip] = phiscale;
            }

            fprintf(stderr,"step %d \n", itest);
            calculate_accel_by_grape6_noopen(clusterid,n,x,v,m,acc,jerk,pot,eps2);
            for (ip = 0; ip < n; ip++) {
                fprintf(stderr,"i:%5d acc[0]=%e, phi= %e  \n", ip, acc[ip][0], pot[ip]);
            }
        }
    }
    return 0;
}
#endif


#ifdef CBTESTMAIN

#define NMAX  100000
/*
 * CBTESTMAIN driver: data round-trip test.
 *
 * argv[1] (optional) selects the mode passed to g6setcbjpmode(); the
 * usage text printed below lists 0 = local broadcast, 1 = 2-way multicast,
 * 2 = p-to-p.  With no argument mode 0 is used.
 *
 * For 200 iterations the test fills datain with nwords pseudo-random
 * words (fixed seed), writes them with linkwrite() (or linkwrite_dma()
 * when DMACBTEST is defined), waits until g6hib_getcounter() reports at
 * least nwords, reads nwords+8 words back with linkread() and compares the
 * first nwords against what was sent.  Any mismatch, a counter value other
 * than nwords, or a non-zero linkread() result counts the iteration as
 * failed and dumps both buffers.
 *
 * Exits with the number of failed iterations as the process status.
 */
int main(int argc, char **argv)
{
    int clusterid = 0;
    int boardid;
    int fboardid;
    /* Zero-initialised because the error dump prints nwords+8 entries of
       both buffers, while datain is only ever filled for nwords entries. */
    unsigned int datain[16384] = {0};
    unsigned int dataout[16384] = {0};
    int ndata;
    int i, nwords, iloop;
    int nerror = 0;
    int ntry = 0;
    int mode = 0;
    int nocheck = 0;

    nwords = 1024;

    if (argc == 1){
        fprintf(stderr,"Use default mode = %d\n", mode);
        fprintf(stderr," mode 0 = local broadcast\n");
        fprintf(stderr," mode 1 = 2-way multicast\n");
        fprintf(stderr," mode 2 = p-to-p\n");
    }else{
        mode = atoi(argv[1]);
    }
    fprintf(stderr,"mode = %d\n", mode);

    g6_open_(&clusterid);
    boardid = clusters[clusterid]->ipboardid;
    fboardid = clusters[clusterid]->foboardid;
    setjpspace(boardid,clusters[clusterid]->system_version,18,18);
    fprintf(stderr, "clusterid, boardid = %d %d %d\n", clusterid, boardid, fboardid);
    /* Walking-one pattern.  The shift count is masked and the operand is
       unsigned: shifting an int by >= its width (i runs up to nwords-1)
       is undefined behaviour. */
    for(i=0;i<nwords;i++)datain[i] = 1u << (i & 31);
    g6setcbjpmode(clusters[0]->ipboardid, 15, mode, 2);
#if 0

    while(1){
        int mode;
        fprintf( stderr,"Enter mode:");
        scanf("%d",&mode);
        g6setcbjpmode(clusters[0]->ipboardid, 15, mode, 2);
    }
#endif
    g6_set_ijp_mode(0,1);

    srandom(12345);
    for(iloop=0;iloop<200;iloop++){
        usleep(10);
#if 0
        if ((ntry % 2) == 0){
            for(i=0;i<nwords;i++)datain[i] = 1u << (i & 31);
        }else{
            for(i=0;i<nwords;i++)datain[i] = 1u << ((nwords-i-1) & 31);
        }
#endif
        /* Fresh pseudo-random payload for every iteration. */
        for(i=0;i<nwords;i++)datain[i] = random();
#if 0
        for(i=0;i<nwords;i++){
            if (datain[i] &1){
                datain[i]=0xffffffff;
            }else{
                datain[i]=0x0;
            }
        }
#endif
        counterclear(clusters[clusterid]->foboardid);
#if 0
        g6hib_printcounter(boardid);
#endif
        /*      for(i=0;i<nwords;i++)linkwrite(boardid, IPLINK, 1,datain+i);*/
#ifndef DMACBTEST
        linkwrite(boardid, IPLINK, nwords,datain);
#else
        /* As of 2002/2/5, the use of DMA here for CBTEST
           is deactivated because this causes too high error rate
           */
        wait_dma_to_finish(fboardid);
        uwait(1000);
        linkwrite_dma(clusterid, IPLINK, nwords,datain);
        uwait(1000);
        wait_dma_to_finish(fboardid);
#endif

        if (!nocheck){
            int error = 0;
            /* Poll the counter until at least nwords have been counted. */
            while ((ndata=g6hib_getcounter(fboardid)) <nwords){
                uwait(10000);
                fprintf(stderr,"Waiting for data ... %d %d\r",nwords,ndata);
            }
            /*      fprintf(stderr,"Ndata= %d ", ndata);*/
#if 0
            g6hib_printcounter(boardid);
            g6hib_printcounter(boardid);
            g6hib_printcounter(boardid);
            g6hib_printcounter(boardid);
            g6hib_printcounter(boardid);
#endif
            /* The loop above exits with ndata >= nwords, so this flags a
               counter that overshot the expected word count. */
            if (ndata !=nwords) error=1;
            error |= linkread(fboardid,nwords+8, dataout);
#if 0
            for(i=0;i<10;i++)
                fprintf(stderr,"Data in = %8x, Data returned = %8x, diff = %8x\n",
                        datain[i], dataout[i], datain[i]^dataout[i]);
#endif
            for(i=0;i<nwords;i++) if (datain[i] !=  dataout[i]) error ++;
            if (error){
                nerror ++;
                /* Dump the 8 extra words read back as well. */
                for(i=0;i<nwords+8;i++)
                    fprintf(stderr,"i=%4d Data in = %8x, Data returned = %8x, diff = %8x\n",
                            i,datain[i], dataout[i], datain[i]^dataout[i]);
            }

            ntry++;
            fprintf(stderr,"Total errors = %d/%d\r", nerror, ntry);
        }

    }
    fprintf(stderr,"\n");
    exit(nerror);
}
#endif


#ifdef TIMETEST





/*
 * Store the current value of wall_second() in tstart.
 * Both tstart and wall_second() are defined outside this block.
 */
void wall_init()
{
    tstart = wall_second();
}



/* Reference timestamps latched by timer_init() and read by timer_out(). */
double cpu0;
double wall0;

/*
 * Start a timed section.
 *
 * s: name of the test, echoed on stdout.
 *
 * Prints the announcement first, then records the wall-clock reference
 * followed by the CPU-time reference.
 */
static void timer_init(char * s)
{
    printf("Timer init for test %s\n", s);

    wall0 = wall_second();
    cpu0  = cpu_second();
}

static void timer_out(char * s, double nbytes)
{
    double wall, cpu;
    wall = wall_second();
    cpu = cpu_second();
    printf("%s, Wall = %12.5e CPU= %12.5e  %12.3fMB/s %12.3fMB/s\n",s,
	   wall - wall0, cpu-cpu0, nbytes/(wall-wall0)/1e6,
	   nbytes/(cpu-cpu0)/1e6);
}



/*
 * TIMETEST driver: transfer timing.
 *
 * Opens cluster 0 and times, with timer_init()/timer_out():
 *   - dummyloop(testlen),
 *   - when DMA is defined: linkwrite_dma(), g6hib_dmar() and g6hib_dmaw()
 *     loops of nrepeat iterations each,
 *   - nrepeat linkwrite() calls of nwords words ("PIO WRITE"),
 *   - nrepeat/5 linkread() calls of nwords words ("PIO READ").
 *
 * testlen is the total word count per test, nbytes = 4*testlen is what is
 * reported to timer_out() for the full-length tests.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    int clusterid = 0;
    int boardid;
    int fboardid;
    unsigned int datain[16384];
    unsigned int dataout[16384];
    int i,nwords, nrepeat, testlen, nbytes;

    nwords=1024;
    testlen = 1024*10000;
    nbytes = testlen*4;
    nrepeat = testlen/nwords;
    g6_open_(&clusterid);
    boardid = clusters[clusterid]->ipboardid;
    fboardid = clusters[clusterid]->foboardid;
    fprintf(stderr, "clusterid, boardid = %d %d %d\n", clusterid, boardid, fboardid);
    /* Walking-one pattern.  Unsigned operand: shifting a signed 1 into
       the sign bit (i & 31 == 31) is undefined for int. */
    for(i=0;i<nwords;i++)datain[i] = 1u<<(i & 31);
    timer_init("DUMMY LOOP");
    dummyloop(testlen);
    timer_out("DUMMY LOOP", testlen);

#ifdef DMA
    timer_init("DMA READ (buffered send)");
    for(i=0;i<nrepeat;i++){
        linkwrite_dma(clusterid, IPLINK, nwords,datain);
        uwait(50);
    }
    timer_out("DMA READ (buffered send)", nbytes);
    timer_init("DMA READ (raw send)");
    for(i=0;i<nrepeat;i++){
        g6hib_dmar(clusters[clusterid]->foboardid,clusters[clusterid]->dma_offset,
                   setbaseadr(1)*sizeof(int),              nwords);
        uwait(50);
        wait_dma_to_finish(clusters[clusterid]->foboardid);
    }
    timer_out("DMA READ (raw send)", nbytes);
    timer_init("DMA WRITE (receive)");
    for(i=0;i<nrepeat;i++){
        g6hib_dmaw(clusters[clusterid]->foboardid,clusters[clusterid]->dma_offset,
                   nwords);
        dummyloop(100);
        wait_dma_to_finish(clusters[clusterid]->foboardid);
    }
    /* Label now matches the one given to timer_init() above. */
    timer_out("DMA WRITE (receive)", nbytes);

#endif
    timer_init("PIO WRITE");
    for(i=0;i<nrepeat;i++){
        linkwrite(boardid, IPLINK, nwords,datain);
    }
    timer_out("PIO WRITE", nbytes);
    /* Only a fifth of the iterations are run for reads, and the byte
       count passed to timer_out() is scaled to match. */
    timer_init("PIO READ");
    for(i=0;i<nrepeat/5;i++){
        linkread(boardid,  nwords,dataout);
    }
    timer_out("PIO READ", nbytes/5);
    return 0;
}
#endif





#ifdef CBTEST2

#define NMAX  100000
int main(void)
{
    static  double x[NMAX][3];
    static      double v[NMAX][3];
    static      double a2[NMAX][3];
    static      double j6[NMAX][3];
    static      double jd18[NMAX][3];
    static      double acc[NMAX][3];
    static      double jerk[NMAX][3];
    static      double pot[NMAX];
    static      int    flag[NMAX];
    static      double m[NMAX];
    static      double t[NMAX];
    static      double dt[NMAX];
    int i,k,n,ir;
    int clusterid = 0;
    int cid1 = 1;
    double ti = 0;
    double eps2 = 0.01;
    double h2 = 2;
    n = 100;
    srand48(12345);
    for(i=0;i<n;i++){
	for(k=0;k<3;k++){
	    x[i][k] = (n-i-1)*(k+2)/4.0;
#if 1
	    x[i][k] = drand48();
	    v[i][k] = drand48();
#endif
	    a2[i][k] = j6[i][k] = jd18[i][k] = 0;
	    acc[i][k] = jerk[i][k] = 1;
	}
	m[i] = 1.0/n;
	t[i]  = 0;
	dt[i] = 0.125;
	pot[i] = 1;
    }
    g6_open_(&clusterid);

    if (1){
	double fscale = m[0]/eps2*100;
	double phiscale = m[0]/sqrt(eps2)*100;
	g6reset(clusters[0]->ipboardid);
	g6errorcounter_clear(0);
	for(i=0;i<n;i++){
	    for(k=0;k<3;k++)acc[i][k] = fscale;
	    pot[i] = phiscale;
	}
	
	for(ir = 0;ir<10;ir++){
	    double ajtmp[3];
	    double jjtmp[3];
	    double j2jtmp[3];
	    double ti, tj, dtj;
	    int j0, i0;
	    int i,k,ii, iend;
	    int flag[MAXPIPELINESPERCHIP+1];
	    int index[MAXPIPELINESPERCHIP+1];
	    double h2[MAXPIPELINESPERCHIP+1];
	    double eps2array[MAXPIPELINESPERCHIP+1];
	    int nharderror = 0;
	    int ipmax = g6_npipes();
	    fprintf(stderr,"step %d\n", ir);
#if 1
	    for(i=0;i<n;i++){
		for(k=0;k<3;k++){
		    x[i][k] = drand48();
		    v[i][k] = drand48();
		}
p	    }
#endif
	    set_simulator_use(clusters[clusterid], 1);
	    ti = 0.0;tj =0.0; dtj = 0.0078125;
	    for(k=0;k<3;k++){
		ajtmp[k] = 0.0;
		jjtmp[k] = 0.0;
		j2jtmp[k] = 0.0;
	    }
	START:
	    g6_set_ti_(&clusterid, &ti);
	    g6setcbjpmode(clusters[0]->ipboardid, 15, 2, 2); 
	    for(i=0;i<n;i++){
		g6_set_j_particle_(&clusterid,&i,&i,&tj,&dtj,m+i,
				   j2jtmp,jjtmp,ajtmp,v[i],x[i]);
	    }
	    usleep(5);
	    g6setcbjpmode(clusters[0]->ipboardid, 15, 0, 2); /* mode set ... local bcast */

	    g6_guestimate_acc_etc(n, eps2, m, acc, jerk, pot);
#if 0
	    set_ijp_mode(clusters[0]->ipboardid); /* hmm, should not be necessary */ 
	    g6setcbjpmode(clusters[0]->ipboardid, 15, 0, 2); /* mode set ... local bcast */
	    set_ijp_mode(clusters[0]->ipboardid); /* hmm, should not be necessary */ 
#endif
	    for(i=0;i<ipmax;i++){
		h2[i] = eps2;
		eps2array[i] = eps2;
	    }
	    for(i=0;i<n;i+=ipmax){
		int error;
		iend = ipmax; if  (iend+i > n) iend = n-i;
		for(ii=0;ii<iend;ii++)index[ii]=i+ii;
		g6calc_firsthalf_(&clusterid, &n, &iend,index,&(x[i]),&(v[i]),&(acc[i]),&(jerk[i]),
				  pot+i,eps2array, h2);
		if (error = g6calc_lasthalf_(&clusterid,&n,&iend,index,&x[i],&v[i],eps2array,h2,
					     &acc[i], &jerk[i], pot+i)){
		    nharderror ++;
		    fprintf(stderr,"(calculate_accell_trial_noopen) hard error %d\n", error);
		    g6_reset_(&clusterid);
		    g6_reset_fofpga_(&clusterid);
		    g6_close_(&clusterid);
		    g6_open_(&clusterid);
		    if (nharderror < 10){
			goto START;
		    }else{
			fprintf(stderr,"(calculate_accell_trial_noopen) too many errors... %d returning -1\n", nharderror);
			exit(-1);
		    }
		}
	    }

	}
	for(i=0;i<n;i++){
	    printf("i:%5d acc: %e  %e  %e \n          jerk %e  %e  %e \n pot, flag: %e %x\n",
		   i, acc[i][0], acc[i][1], acc[i][2],
		   jerk[i][0], jerk[i][1], jerk[i][2],
		   pot[i], flag[i]);
	}
    }
    return 0;
}
#endif

#ifdef FPGATEST


/*
 * FPGATEST driver: endlessly repeats open -> sleep(1) -> reset ->
 * reset_fofpga -> close on cluster 0.  The loop never exits, so the
 * function does not return.
 *
 * Declared as int main(void): void main is not a standard signature for
 * a hosted C program.
 */
int main(void)
{
    int clusterid = 0;

    while(1){
        g6_open_(&clusterid);
        sleep(1);
        g6_reset_(&clusterid);
        g6_reset_fofpga_(&clusterid);
        g6_close_(&clusterid);
    }
}
#endif
