diff --git a/TicTacToe/policy_p1 b/TicTacToe/policy_p1 index 72fbaba..6ccbd1d 100644 Binary files a/TicTacToe/policy_p1 and b/TicTacToe/policy_p1 differ diff --git a/TicTacToe/policy_p2 b/TicTacToe/policy_p2 index 9447c6e..ad5e7d7 100644 Binary files a/TicTacToe/policy_p2 and b/TicTacToe/policy_p2 differ diff --git a/TicTacToe/tic-tac-toe.ipynb b/TicTacToe/tic-tac-toe.ipynb index f2a6277..fd3c8da 100644 --- a/TicTacToe/tic-tac-toe.ipynb +++ b/TicTacToe/tic-tac-toe.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -23,12 +23,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "BOARD_ROWS = 3\n", - "BOARD_COLS = 3" + "BOARD_COLS = 3\n", + "train = False" ] }, { @@ -44,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -83,11 +84,10 @@ " # diagonal\n", " diag_sum1 = sum([self.board[i, i] for i in range(BOARD_COLS)])\n", " diag_sum2 = sum([self.board[i, BOARD_COLS-i-1] for i in range(BOARD_COLS)])\n", - " diag_sum = max(diag_sum1, diag_sum2)\n", - " if diag_sum == 3:\n", + " if diag_sum1 == 3 or diag_sum2 == 3:\n", " self.isEnd = True\n", " return 1\n", - " if diag_sum == -3:\n", + " if diag_sum1 == -3 or diag_sum2 == -3:\n", " self.isEnd = True\n", " return -1\n", " \n", @@ -222,7 +222,10 @@ " if self.board[i, j] == -1:\n", " token = 'o'\n", " if self.board[i, j] == 0:\n", - " token = ' '\n", + " token = str(j+1 + i*3)\n", + "\n", + "\n", + "\n", " out += token + ' | '\n", " print(out)\n", " print('-------------') " @@ -230,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -294,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -302,13 +305,25 @@ " def __init__(self, name):\n", " self.name = name \n", " \n", + " def cell_to_row_col(self,cell_number):\n", + " if not 1 <= cell_number <= 9:\n", + " raise ValueError(\"Cell number must be between 1 and 9\")\n", + "\n", + " row = (cell_number - 1) // 3\n", + " col = (cell_number - 1) % 3\n", + " return row, col\n", + "\n", + " \n", " def chooseAction(self, positions):\n", " while True:\n", - " row = int(input(\"Input your action row:\"))\n", - " col = int(input(\"Input your action col:\"))\n", + " cell = int(input(\"Input your action:\"))\n", + " row,col = self.cell_to_row_col(cell)\n", " action = (row, col)\n", " if action in positions:\n", " return action\n", + " \n", + "\n", + "\n", " \n", " # append a hash state\n", " def addState(self, state):\n", @@ -331,91 +346,28 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 59, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training...\n", - "Rounds 0\n", - "Rounds 1000\n", - "Rounds 2000\n", - "Rounds 3000\n", - "Rounds 4000\n", - "Rounds 5000\n", - "Rounds 6000\n", - "Rounds 7000\n", - "Rounds 8000\n", - "Rounds 9000\n", - "Rounds 10000\n", - "Rounds 11000\n", - "Rounds 12000\n", - "Rounds 13000\n", - "Rounds 14000\n", - "Rounds 15000\n", - "Rounds 16000\n", - "Rounds 17000\n", - "Rounds 18000\n", - "Rounds 19000\n", - "Rounds 20000\n", - "Rounds 21000\n", - "Rounds 22000\n", - "Rounds 23000\n", - "Rounds 24000\n", - "Rounds 25000\n", - "Rounds 26000\n", - "Rounds 27000\n", - "Rounds 28000\n", - "Rounds 29000\n", - "Rounds 30000\n", - "Rounds 31000\n", - "Rounds 32000\n", - "Rounds 33000\n", - "Rounds 34000\n", - "Rounds 35000\n", - "Rounds 36000\n", - "Rounds 37000\n", - "Rounds 38000\n", - "Rounds 39000\n", - "Rounds 40000\n", - "Rounds 41000\n", - "Rounds 42000\n", - "Rounds 43000\n", - "Rounds 44000\n", - "Rounds 45000\n", - "Rounds 46000\n", - "Rounds 47000\n", - "Rounds 48000\n", - "Rounds 49000\n" - ] - } - ], + "outputs": [], "source": [ "p1 = Player(\"p1\")\n", "p2 = Player(\"p2\")\n", - "\n", "st = State(p1, p2)\n", - "print(\"training...\")\n", - "st.play(50000)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "p1.savePolicy()\n", - "p2.savePolicy()" + "\n", + "# If you want to re-train the agent change the variable at the beginning of the code to True\n", + "# The agent takes approximately 3 minutes to train\n", + "if train:\n", + " print(\"training...\")\n", + " st.play(50000)\n", + " p1.savePolicy()\n", + " p2.savePolicy()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -431,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -439,141 +391,62 @@ "output_type": "stream", "text": [ "-------------\n", - "| | | | \n", - "-------------\n", - "| | x | | \n", + "| x | 2 | 3 | \n", "-------------\n", - "| | | | \n", + "| 4 | 5 | 6 | \n", "-------------\n", - "Input your action row:2\n", - "Input your action col:2\n", + "| 7 | 8 | 9 | \n", "-------------\n", - "| | | | \n", "-------------\n", - "| | x | | \n", + "| x | 2 | 3 | \n", "-------------\n", - "| | | o | \n", + "| 4 | o | 6 | \n", "-------------\n", + "| 7 | 8 | 9 | \n", "-------------\n", - "| | | | \n", "-------------\n", - "| | x | | \n", + "| x | 2 | 3 | \n", "-------------\n", - "| | x | o | \n", + "| 4 | o | 6 | \n", "-------------\n", - "Input your action row:0\n", - "Input your action col:1\n", + "| 7 | 8 | x | \n", "-------------\n", - "| | o | | \n", "-------------\n", - "| | x | | \n", + "| x | 2 | 3 | \n", "-------------\n", - "| | x | o | \n", + "| 4 | o | 6 | \n", "-------------\n", + "| 7 | o | x | \n", "-------------\n", - "| | o | x | \n", "-------------\n", - "| | x | | \n", + "| x | x | 3 | \n", "-------------\n", - "| | x | o | \n", + "| 4 | o | 6 | \n", "-------------\n", - "Input your action row:1\n", - "Input your action col:1\n", - "Input your action row:1\n", - "Input your action col:0\n", + "| 7 | o | x | \n", "-------------\n", - "| | o | x | \n", - "-------------\n", - "| o | x | | \n", - "-------------\n", - "| | x | o | \n", - "-------------\n", - "-------------\n", - "| | o | x | \n", - "-------------\n", - "| o | x | | \n", "-------------\n", "| x | x | o | \n", "-------------\n", - "computer wins!\n" - ] - } - ], - "source": [ - "p1 = Player(\"computer\", exp_rate=0)\n", - "p1.loadPolicy(\"policy_p1\")\n", - "\n", - "p2 = HumanPlayer(\"human\")\n", - "\n", - "st = State(p1, p2)\n", - "st.play2()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------------\n", - "| | | | \n", + "| 4 | o | 6 | \n", "-------------\n", - "| | | x | \n", + "| 7 | o | x | \n", "-------------\n", - "| | | | \n", "-------------\n", - "Input your action row:2\n", - "Input your action col:2\n", - "-------------\n", - "| | | | \n", - "-------------\n", - "| | | x | \n", - "-------------\n", - "| | | o | \n", - "-------------\n", - "-------------\n", - "| | | | \n", - "-------------\n", - "| | x | x | \n", - "-------------\n", - "| | | o | \n", - "-------------\n", - "Input your action row:1\n", - "Input your action col:0\n", - "-------------\n", - "| | | | \n", - "-------------\n", - "| o | x | x | \n", - "-------------\n", - "| | | o | \n", - "-------------\n", - "-------------\n", - "| | | | \n", - "-------------\n", - "| o | x | x | \n", - "-------------\n", - "| x | | o | \n", - "-------------\n", - "Input your action row:0\n", - "Input your action col:0\n", - "-------------\n", - "| o | | | \n", + "| x | x | o | \n", "-------------\n", - "| o | x | x | \n", + "| x | o | 6 | \n", "-------------\n", - "| x | | o | \n", + "| 7 | o | x | \n", "-------------\n", "-------------\n", - "| o | | x | \n", + "| x | x | o | \n", "-------------\n", - "| o | x | x | \n", + "| x | o | 6 | \n", "-------------\n", - "| x | | o | \n", + "| o | o | x | \n", "-------------\n", - "computer wins!\n" + "human wins!\n" ] } ], @@ -604,7 +477,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/TicTacToe/ticTacToe.py b/TicTacToe/ticTacToe.py index 5e18039..85ec08c 100644 --- a/TicTacToe/ticTacToe.py +++ b/TicTacToe/ticTacToe.py @@ -3,7 +3,7 @@ BOARD_ROWS = 3 BOARD_COLS = 3 - +train = False class State: def __init__(self, p1, p2): @@ -39,14 +39,22 @@ def winner(self): return -1 # diagonal diag_sum1 = sum([self.board[i, i] for i in range(BOARD_COLS)]) - diag_sum2 = sum([self.board[i, BOARD_COLS - i - 1] for i in range(BOARD_COLS)]) - diag_sum = max(abs(diag_sum1), abs(diag_sum2)) - if diag_sum == 3: + diag_sum2 = sum([self.board[i, BOARD_COLS-i-1] for i in range(BOARD_COLS)]) + if diag_sum1 == 3 or diag_sum2 == 3: self.isEnd = True - if diag_sum1 == 3 or diag_sum2 == 3: - return 1 - else: - return -1 + return 1 + if diag_sum1 == -3 or diag_sum2 == -3: + self.isEnd = True + return -1 + + # tie + # no available positions + if len(self.availablePositions()) == 0: + self.isEnd = True + return 0 + # not end + self.isEnd = False + return None # tie # no available positions @@ -179,10 +187,13 @@ def showBoard(self): if self.board[i, j] == -1: token = 'o' if self.board[i, j] == 0: - token = ' ' + token = str(j+1 + i*3) + + + out += token + ' | ' print(out) - print('-------------') + print('-------------') class Player: @@ -245,40 +256,57 @@ def loadPolicy(self, file): class HumanPlayer: def __init__(self, name): - self.name = name + self.name = name + + def cell_to_row_col(self,cell_number): + if not 1 <= cell_number <= 9: + raise ValueError("Cell number must be between 1 and 9") + + row = (cell_number - 1) // 3 + col = (cell_number - 1) % 3 + return row, col + def chooseAction(self, positions): while True: - row = int(input("Input your action row:")) - col = int(input("Input your action col:")) + cell = int(input("Input your action:")) + row,col = self.cell_to_row_col(cell) action = (row, col) if action in positions: return action + + + # append a hash state def addState(self, state): pass - + # at the end of game, backpropagate and update states value def feedReward(self, reward): pass - + def reset(self): pass if __name__ == "__main__": - # training + p1 = Player("p1") p2 = Player("p2") - st = State(p1, p2) - print("training...") - st.play(50000) + + # If you want to re-train the agent change the variable at the beginning of the code to True + # The agent takes approximately 3 minutes to train + if train: + print("training...") + st.play(5000) + p1.savePolicy() + p2.savePolicy() # play with human p1 = Player("computer", exp_rate=0) - p1.loadPolicy("policy_p1") + p1.loadPolicy("TicTacToe/policy_p1") p2 = HumanPlayer("human")