17.5.2023
Optimization of Python object structures using Cython(Python object data structures can be fast)
If you are building your project using object-oriented design, which includes many individual classes, you have probably encountered a speed problem from a certain point in the project. Since I've been dealing with this problem and it took me some time to find and test a workable solution, I'm writing this short article with a summary of the ways in which optimization can be done.
Compiling classes to C using Cython alone, although the class methods may be perfectly optimized, does not lead to the desired speedup. An example of this is large collections of instances with references to other instances, where without the necessary optimization, the use of object structures adds an order of magnitude and possibly more orders of complexity.
As an example, consider the following simple structure:
import random
from collections import deque
class GameTilePy:
_parent = None
_pos_x = 0
_pos_y = 0
_flag = 0
_neightbors = None
visited = False
back_ref = None
def __init__(self, parent, pos_x, pos_y, flag):
self._parent = parent
self._pos_x = pos_x
self._pos_y = pos_y
self._flag = flag
self._neightbors = []
def get_coords(self):
return self._pos_x, self._pos_y
def get_pos_x(self):
return self._pos_x
def get_pos_y(self):
return self._pos_y
def get_flag(self):
return self._flag
def get_neightbors(self):
return self._neightbors
def update(self):
self._neightbors = []
for fy in range(-1, 2):
for fx in range(-1, 2):
tile = self._parent.get_tile(self._pos_x + fx, self._pos_y + fy)
if tile is not None:
self._neightbors.append(tile)
class GameMapPy:
_width = 0
_height = 0
_area = []
def __init__(self, width, height):
self._width = width
self._height = height
self._area = []
for fy in range(0, self._height):
self._area.append( list(( GameTilePy(self, fx, fy, 1 if random.randint(0, 10) == 0 else 0) for fx in range(0, self._width) )) )
for fy in range(0, self._height):
[self._area[fy][fx].update() for fx in range(0, self._width)]
def get_tile(self, pos_x, pos_y):
return self._area[pos_y][pos_x] if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height else None
def get_leaf_tile(self):
return self.get_tile(self._width - 1, self._height - 1)
def path(self, tile_from, tile_to):
for fy in range(0, self._height):
for fx in range(0, self._width):
self._area[fy][fx].visited = False
is_found = False
dq = deque()
dq.append(tile_from)
while dq:
cur = dq.popleft()
cur.visited = True
if cur is tile_to:
is_found = True
break
for nb in cur.get_neightbors():
if not nb.visited and nb.get_flag() == 0:
nb.visited = True
nb.back_ref = cur
dq.append(nb)
result = []
if is_found:
cur = tile_to
while cur is not tile_from:
result.append(cur)
cur = cur.back_ref
return result
It is a simple code that creates a trivial random map and can find a path in it. An important feature is that it contains a map of instances of size width x height. Working with these instances and accessing their properties is computationally intensive - although the complexity of such operations seems O(1), in reality all these calls in pure Python is more complex and it makes sense to optimize the classes completely.
Prerequisites:
There are three ways to create a class in Cython:
- a) compiling classic python classes - starting with "class Cls" and importing with "import"
- b) basic version of extension type - .pyx file with class labeled "cdef class Cls" and importing using "import"
- c) complete extension type - a .pyx file with a .pxd header with the class labelled "cdef class Cls" and imported using "cimport"
For optimization, we are interested in the last option, which is the complete extension type including the header file - the compiled C code then also generates the data structure (typedef struct) and the corresponding Python-class instances can be type-casted to this structure and the attributes can be accessed directly. These instances can then be stored as weak references directly in the form of a C pointer - eliminating the overhead with reference-counting of the cpython interpreter.
The optimized code is below - with details in the comments:
gametile.pxd
cdef class GameTile:
cdef:
object _parent
long _pos_x
long _pos_y
long _flag
void **_neightbors
cdef public:
# public properties are possible
int visited
# public "void *" is prohibited - so we use numeric type
size_t back_ref
cdef long get_pos_x(self)
cdef long get_pos_y(self)
cdef long get_flag(self)
cdef void ** get_neightbors(self)
cdef long get_neightbors_count(self)
cdef void update(self)
gametile.pyx
from libc.stdlib cimport malloc, realloc, free
from src.gamemap cimport GameMap
cdef class GameTile:
def __cinit__(self, parent, long pos_x, long pos_y, long flag):
self._parent = parent
self._pos_x = pos_x
self._pos_y = pos_y
self._flag = flag
# maximum of 8 neightbors
self._neightbors = <void **>malloc(sizeof(void *) * 8)
self.visited = False
self.back_ref = <size_t>NULL
def __dealloc__(self):
# dont forget to dealloc when object is garbage-collected
free(self._neightbors)
# following cdef methods must have return type
def get_coords(self):
return self._pos_x, self._pos_y
cdef long get_pos_x(self):
return self._pos_x
cdef long get_pos_y(self):
return self._pos_y
cdef long get_flag(self):
return self._flag
cdef void ** get_neightbors(self):
return self._neightbors
cdef long get_neightbors_count(self):
return 8
cdef void update(self):
cdef long fx, fy, idx
cdef GameTile tile
idx = 0
for fy in range(-1, 2):
for fx in range(-1, 2):
if fy != 0 or fx != 0:
tile = (<GameMap>self._parent).get_tile(self._pos_x + fx, self._pos_y + fy)
if tile is not None:
# store raw pointer
self._neightbors[idx] = <void *>tile
else:
# set NULL on unavailable tiles
self._neightbors[idx] = NULL
idx += 1
gamemap.pxd
from src.gametile cimport GameTile
cdef class GameMap:
cdef:
long _width
long _height
object _tiles
void ** _area
cdef GameTile get_tile(self, long pos_x, long pos_y)
cdef void * get_tile_ptr(self, long pos_x, long pos_y)
cdef object path(self, GameTile tile_from, GameTile tile_to)
gamemap.pyx
import random
from collections import deque
from libc.stdlib cimport malloc, realloc, free
from src.gametile cimport GameTile
cdef class GameMap:
def __cinit__(self, long width, long height):
cdef long fx, fy, flag
cdef GameTile tile
self._width = width
self._height = height
# native list of all tiles
self._tiles = []
# allocation of 1d array - coords of x,y coordinates will be transformated to index
self._area = <void **>malloc(sizeof(void *) * self._width * self._height)
for fy in range(0, self._height):
for fx in range(0, self._width):
# 9 of 10 tiles accessible
flag = 1 if random.randint(0, 10) == 0 else 0
tile = GameTile(self, fx, fy, flag)
# the tile reference MUST BE STORED SOMEWHERE to be ref-counted - without that program will crash
self._tiles.append(tile)
# pointer for fast access to the instance
self._area[fy * self._width + fx] = <void *>tile
for fy in range(0, self._height):
# fast access by index - instance must be type-casted from void * to GameTile
[(<GameTile>self._area[fy * self._width + fx]).update() for fx in range(0, self._width)]
def __dealloc__(self):
# dont forget to dealloc when object is garbage-collected
free(self._area)
def get_tile_p(self, pos_x, pos_y):
# returned GameTile which is castable to Python object
# "def" function for access from Python (i dont like to use "cpdef"")
return self.get_tile(pos_x, pos_y)
def get_leaf_tile_p(self):
return self.get_tile(self._width - 1, self._height - 1)
def path_p(self, tile_from, tile_to):
# input tiles as Python object are automatically casted to GameTile
return self.path(tile_from, tile_to)
cdef GameTile get_tile(self, long pos_x, long pos_y):
# fast method to retrieve tile by coords
if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height:
return <GameTile>self._area[pos_y * self._width + pos_x]
else:
return None
cdef void * get_tile_ptr(self, long pos_x, long pos_y):
# even faster method without type-casting - so python doesnt do inc-ref stuff
# NOTE there are difference between "NULL" and "None" - "NULL" is zero pointer (zero value) and "None" is alive object with instance - its possible to typecast <void *>None which return non-zero number
if pos_x >= 0 and pos_y >= 0 and pos_x < self._width and pos_y < self._height:
return <void *>self._area[pos_y * self._width + pos_x]
else:
return NULL
cdef object path(self, GameTile tile_from, GameTile tile_to):
cdef long fx, fy, idx
cdef int is_found
cdef object dq, result
# c-array of neighbors
cdef void ** nbs
# exact types which will be automatically type-casted
cdef GameTile cur, nb
for fy in range(0, self._height):
for fx in range(0, self._width):
# access to array of pointers must be typecasted to class
(<GameTile>self._area[fy * self._width + fx]).visited = False
is_found = False
dq = deque()
dq.append(tile_from)
while dq:
# deque is standard python collection and returns generic object which is automatically type-casted because GameTile type of "cur"
cur = dq.popleft()
cur.visited = True
if cur is tile_to:
is_found = True
break
nbs = cur.get_neightbors()
for idx in range(0, cur.get_neightbors_count()):
# when accessing to property directly after typecast like "(<GameTile>nbs[idx]).visited" - the compiler does not do inc-ref and dec-ref on the instance
if nbs[idx] == NULL:
pass
elif not (<GameTile>nbs[idx]).visited and (<GameTile>nbs[idx]).get_flag() == 0:
# reference is inscreased here
nb = <GameTile>nbs[idx]
nb.visited = True
# both casts needed
nb.back_ref = <size_t>(<void *>cur)
dq.append(nb)
result = []
if is_found:
cur = tile_to
while cur is not tile_from:
result.append(cur)
# back_ref is size_t so cast it to void * and then to GameTile
cur = <GameTile>(<void *>cur.back_ref)
return result
Finally, test both versions on a 2000x2000 map as follows:
from src.gamemap import GameMap
from gamemappy import GameMapPy
def main_py():
m = GameMapPy(2000, 2000)
tiles = m.path(m.get_tile(0, 0), m.get_leaf_tile())
print("Path found with " + str(len(tiles)) + " steps!")
def main_c():
m = GameMap(2000, 2000)
tiles = m.path_p(m.get_tile_p(0, 0), m.get_leaf_tile_p())
print("Path found with " + str(len(tiles)) + " steps!")
if __name__ == '__main__':
main_py()
#main_c()
Pure Python version:
Path found with 2050 steps! real 0m32,357s user 0m31,234s sys 0m1,120s
and Cython version:
Path found with 2055 steps! real 0m7,750s user 0m7,433s sys 0m0,312s
Of course, there may be many ideas to improve this code - here is just a basic example with a few optimization ideas.
© 2024 Dzejkob games | info@dzejkobgames.eu | YouTube channel | Itch.io