17.5.2023

Optimization of Python object structures using Cython(Python object data structures can be fast)

If you are building your project using object-oriented design, which includes many individual classes, you have probably encountered a speed problem from a certain point in the project. Since I've been dealing with this problem and it took me some time to find and test a workable solution, I'm writing this short article with a summary of the ways in which optimization can be done.

Compiling classes to C using Cython alone, although the class methods may be perfectly optimized, does not lead to the desired speedup. An example of this is large collections of instances with references to other instances, where without the necessary optimization, the use of object structures adds an order of magnitude and possibly more orders of complexity.

As an example, consider the following simple structure:

import random
from collections import deque


class GameTilePy:
	_parent = None
	_pos_x = 0
	_pos_y = 0
	_flag = 0
	_neightbors = None

	visited = False
	back_ref = None

	def __init__(self, parent, pos_x, pos_y, flag):
		self._parent = parent
		self._pos_x = pos_x
		self._pos_y = pos_y
		self._flag = flag
		self._neightbors = []

	def get_coords(self):
		return self._pos_x, self._pos_y

	def get_pos_x(self):
		return self._pos_x

	def get_pos_y(self):
		return self._pos_y

	def get_flag(self):
		return self._flag

	def get_neightbors(self):
		return self._neightbors

	def update(self):
		self._neightbors = []

		for fy in range(-1, 2):
			for fx in range(-1, 2):
				tile = self._parent.get_tile(self._pos_x + fx, self._pos_y + fy)

				if tile is not None:
					self._neightbors.append(tile)


class GameMapPy:
	_width = 0
	_height = 0
	_area = []

	def __init__(self, width, height):
		self._width = width
		self._height = height
		self._area = []

		for fy in range(0, self._height):
			self._area.append( list(( GameTilePy(self, fx, fy, 1 if random.randint(0, 10) == 0 else 0) for fx in range(0, self._width) )) )

		for fy in range(0, self._height):
			[self._area[fy][fx].update() for fx in range(0, self._width)]

	def get_tile(self, pos_x, pos_y):
		return self._area[pos_y][pos_x] if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height else None

	def get_leaf_tile(self):
		return self.get_tile(self._width - 1, self._height - 1)

	def path(self, tile_from, tile_to):
		for fy in range(0, self._height):
			for fx in range(0, self._width):
				self._area[fy][fx].visited = False

		is_found = False

		dq = deque()
		dq.append(tile_from)

		while dq:
			cur = dq.popleft()
			cur.visited = True

			if cur is tile_to:
				is_found = True
				break

			for nb in cur.get_neightbors():
				if not nb.visited and nb.get_flag() == 0:
					nb.visited = True
					nb.back_ref = cur
					dq.append(nb)

		result = []

		if is_found:
			cur = tile_to

			while cur is not tile_from:
				result.append(cur)
				cur = cur.back_ref

		return result

It is a simple code that creates a trivial random map and can find a path in it. An important feature is that it contains a map of instances of size width x height. Working with these instances and accessing their properties is computationally intensive - although the complexity of such operations seems O(1), in reality all these calls in pure Python is more complex and it makes sense to optimize the classes completely.

Prerequisites:

  • Python coding
  • Knowledge of Cython tool
  • Programming in C/C++

There are three ways to create a class in Cython:

  • a) compiling classic python classes - starting with "class Cls" and importing with "import"
  • b) basic version of extension type - .pyx file with class labeled "cdef class Cls" and importing using "import"
  • c) complete extension type - a .pyx file with a .pxd header with the class labelled "cdef class Cls" and imported using "cimport"

For optimization, we are interested in the last option, which is the complete extension type including the header file - the compiled C code then also generates the data structure (typedef struct) and the corresponding Python-class instances can be type-casted to this structure and the attributes can be accessed directly. These instances can then be stored as weak references directly in the form of a C pointer - eliminating the overhead with reference-counting of the cpython interpreter.

The optimized code is below - with details in the comments:

gametile.pxd



cdef class GameTile:
	cdef:
		object _parent
		long _pos_x
		long _pos_y
		long _flag
		void **_neightbors

	cdef public:
		# public properties are possible
		int visited

		# public "void *" is prohibited - so we use numeric type
		size_t back_ref

	cdef long get_pos_x(self)

	cdef long get_pos_y(self)

	cdef long get_flag(self)

	cdef void ** get_neightbors(self)

	cdef long get_neightbors_count(self)

	cdef void update(self)

gametile.pyx

from libc.stdlib cimport malloc, realloc, free
from src.gamemap cimport GameMap


cdef class GameTile:
	def __cinit__(self, parent, long pos_x, long pos_y, long flag):
		self._parent = parent
		self._pos_x = pos_x
		self._pos_y = pos_y
		self._flag = flag

		# maximum of 8 neightbors
		self._neightbors = <void **>malloc(sizeof(void *) * 8)

		self.visited = False
		self.back_ref = <size_t>NULL

	def __dealloc__(self):
		# dont forget to dealloc when object is garbage-collected

		free(self._neightbors)

	# following cdef methods must have return type

	def get_coords(self):
		return self._pos_x, self._pos_y

	cdef long get_pos_x(self):
		return self._pos_x

	cdef long get_pos_y(self):
		return self._pos_y

	cdef long get_flag(self):
		return self._flag

	cdef void ** get_neightbors(self):
		return self._neightbors

	cdef long get_neightbors_count(self):
		return 8

	cdef void update(self):
		cdef long fx, fy, idx
		cdef GameTile tile

		idx = 0

		for fy in range(-1, 2):
			for fx in range(-1, 2):
				if fy != 0 or fx != 0:
					tile = (<GameMap>self._parent).get_tile(self._pos_x + fx, self._pos_y + fy)

					if tile is not None:
						# store raw pointer
						self._neightbors[idx] = <void *>tile

					else:
						# set NULL on unavailable tiles
						self._neightbors[idx] = NULL

					idx += 1

gamemap.pxd

from src.gametile cimport GameTile


cdef class GameMap:
	cdef:
		long _width
		long _height
		object _tiles
		void ** _area

	cdef GameTile get_tile(self, long pos_x, long pos_y)

	cdef void * get_tile_ptr(self, long pos_x, long pos_y)

	cdef object path(self, GameTile tile_from, GameTile tile_to)

gamemap.pyx

import random
from collections import deque
from libc.stdlib cimport malloc, realloc, free
from src.gametile cimport GameTile


cdef class GameMap:
	def __cinit__(self, long width, long height):
		cdef long fx, fy, flag
		cdef GameTile tile

		self._width = width
		self._height = height

		# native list of all tiles
		self._tiles = []

		# allocation of 1d array - coords of x,y coordinates will be transformated to index
		self._area = <void **>malloc(sizeof(void *) * self._width * self._height)

		for fy in range(0, self._height):
			for fx in range(0, self._width):
				# 9 of 10 tiles accessible

				flag = 1 if random.randint(0, 10) == 0 else 0
				tile = GameTile(self, fx, fy, flag)

				# the tile reference MUST BE STORED SOMEWHERE to be ref-counted - without that program will crash

				self._tiles.append(tile)

				# pointer for fast access to the instance

				self._area[fy * self._width + fx] = <void *>tile

		for fy in range(0, self._height):
			# fast access by index - instance must be type-casted from void * to GameTile

			[(<GameTile>self._area[fy * self._width + fx]).update() for fx in range(0, self._width)]

	def __dealloc__(self):
		# dont forget to dealloc when object is garbage-collected
		free(self._area)

	def get_tile_p(self, pos_x, pos_y):
		# returned GameTile which is castable to Python object
		# "def" function for access from Python (i dont like to use "cpdef"")
		return self.get_tile(pos_x, pos_y)

	def get_leaf_tile_p(self):
		return self.get_tile(self._width - 1, self._height - 1)

	def path_p(self, tile_from, tile_to):
		# input tiles as Python object are automatically casted to GameTile
		return self.path(tile_from, tile_to)

	cdef GameTile get_tile(self, long pos_x, long pos_y):
		# fast method to retrieve tile by coords
		if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height:
			return <GameTile>self._area[pos_y * self._width + pos_x]
		else:
			return None

	cdef void * get_tile_ptr(self, long pos_x, long pos_y):
		# even faster method without type-casting - so python doesnt do inc-ref stuff
		# NOTE there are difference between "NULL" and "None" - "NULL" is zero pointer (zero value) and "None" is alive object with instance - its possible to typecast <void *>None which return non-zero number
		if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height:
			return <void *>self._area[pos_y * self._width + pos_x] 
		else:
			return NULL

	cdef object path(self, GameTile tile_from, GameTile tile_to):
		cdef long fx, fy, idx
		cdef int is_found
		cdef object dq, result

		# c-array of neighbors
		cdef void ** nbs

		# exact types which will be automatically type-casted
		cdef GameTile cur, nb

		for fy in range(0, self._height):
			for fx in range(0, self._width):
				# access to array of pointers must be typecasted to class
				(<GameTile>self._area[fy * self._width + fx]).visited = False

		is_found = False

		dq = deque()
		dq.append(tile_from)

		while dq:
			# deque is standard python collection and returns generic object which is automatically type-casted because GameTile type of "cur"
			cur = dq.popleft()
			cur.visited = True

			if cur is tile_to:
				is_found = True
				break

			nbs = cur.get_neightbors()

			for idx in range(0, cur.get_neightbors_count()):
				# when accessing to property directly after typecast like "(<GameTile>nbs[idx]).visited" - the compiler does not do inc-ref and dec-ref on the instance

				if nbs[idx] == NULL:
					pass

				elif not (<GameTile>nbs[idx]).visited and (<GameTile>nbs[idx]).get_flag() == 0:
					# reference is inscreased here

					nb = <GameTile>nbs[idx]
					nb.visited = True

					# both casts needed
					nb.back_ref = <size_t>(<void *>cur)

					dq.append(nb)

		result = []

		if is_found:
			cur = tile_to

			while cur is not tile_from:
				result.append(cur)

				# back_ref is size_t so cast it to void * and then to GameTile
				cur = <GameTile>(<void *>cur.back_ref)

		return result

Finally, test both versions on a 2000x2000 map as follows:

from src.gamemap import GameMap
from gamemappy import GameMapPy


def main_py():
    m = GameMapPy(2000, 2000)
    tiles = m.path(m.get_tile(0, 0), m.get_leaf_tile())

    print("Path found with " + str(len(tiles)) + " steps!")


def main_c():
    m = GameMap(2000, 2000)
    tiles = m.path_p(m.get_tile_p(0, 0), m.get_leaf_tile_p())

    print("Path found with " + str(len(tiles)) + " steps!")


if __name__ == '__main__':
    main_py()
    #main_c()
                    

Pure Python version:

Path found with 2050 steps!

real    0m32,357s
user    0m31,234s
sys     0m1,120s
                    

and Cython version:

Path found with 2055 steps!

real    0m7,750s
user    0m7,433s
sys     0m0,312s
                    

Of course, there may be many ideas to improve this code - here is just a basic example with a few optimization ideas.

© 2024 Dzejkob games | info@dzejkobgames.eu | YouTube channel | Itch.io