collideAndStream() versus collide()+stream()


I am getting strange results when I try to replace the call to collideAndStream() by separate calls to collide() and stream(). I was under the impression that the two should be equivalent, but apparently they are not.
The code below (based on the permeability-tutorial) produces nice results when SEPARATE_CS is undefined, so that collideAndStream() is called, but if I define SEPARATE_CS, so that the code calls first collide() and then stream(), the results are different and very strange.
I have tried this with version 0.7r2 (compiled with MPI but run on a single processor) and for different lattices and boundary conditions and always get strange results. Is there a reason for this or is it a bug?
(Not that it matters very much for most problems, but I was playing around trying to do some things between the collision step and the streaming step when I discovered this and found it rather annoying.)


#include "palabos3D.h"
#include "palabos3D.hh"

using namespace plb;


typedef double T;
#define DESCRIPTOR descriptors::D3Q19Descriptor

struct PressureGradient
	PressureGradient(T deltaP_, plint nx_) : deltaP(deltaP_), nx(nx_) {}
	void operator()(plint iX, plint iY, plint iZ, T& density, Array<T, 3>& velocity) const
		density = 1. - deltaP * DESCRIPTOR<T>::invCs2 / (T)(nx-1) * (T)iX;
	T deltaP;
	plint nx;

int main(int argc, char **argv)
	plbInit(&argc, &argv);

	const T omega = 1.0;
	const T deltaP = 0.0001;
	const plint nx=30, ny=30, nz=30;
	MultiBlockLattice3D<T,DESCRIPTOR> lattice(nx,ny,nz, new BGKdynamics<T,DESCRIPTOR>(omega));

	OnLatticeBoundaryCondition3D<T, DESCRIPTOR>* bc =
	Box3D inlet(0, 0, 0, ny - 1, 0, nz - 1);
	bc->addPressureBoundary0N(inlet, lattice);
	setBoundaryDensity(lattice, inlet, 1.);

	Box3D outlet(nx - 1, nx - 1, 0, ny - 1, 0, nz - 1);
	bc->addPressureBoundary0P(outlet, lattice);
	setBoundaryDensity(lattice, outlet, 1. - deltaP * DESCRIPTOR<T>::invCs2);

	initializeAtEquilibrium( lattice, lattice.getBoundingBox(), PressureGradient(deltaP, nx) );

	for (plint iT=0;iT<500; ++iT)
	VtkImageOutput3D<T> vtkOut("vtk_out", 1.);
	vtkOut.writeData<float>(*computeDensity(lattice), "pressure", DESCRIPTOR<T>::cs2);
	vtkOut.writeData<3, float>(*computeVelocity(lattice), "velocity", 1.);
	return 0;